DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. DataCamp also has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
The original DataCamp_Insights_v001 document has been split for this document:
There are a few nuggets within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
# Factors
# An unordered factor supports equality tests (==) but not ordered
# comparisons (<, >) between levels.
xRaw = c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
xFactorNon = factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
# Comparing an unordered factor with '>' warns and yields NA:
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
# ordered=TRUE makes the level sequence meaningful, so < and > work:
xFactorOrder = factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
# subset() is a convenience wrapper around logical row indexing.
data(mtcars)
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# identical() confirms subset() matches plain bracket indexing exactly
# (values and attributes):
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
# select= limits the columns returned (note: strict > here, but the same
# six cars appear because no mpg value equals exactly 25):
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & and && (and, analogously, | and ||)
# & and | are elementwise (vectorized); && and || are scalar and
# short-circuit, examining only the first element of each operand here.
# NOTE(review): in R >= 4.3, && and || signal an error when given inputs of
# length greater than one, so the last two examples no longer run on
# current R versions.
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
(compA > compB) && (compA + compB < 6)
## [1] TRUE
(compA > compB) || (compA + compB < 6)
## [1] TRUE
# Loops and cat()
# Generic for-loop skeleton: `break` exits the loop, `next` skips to the
# next iteration.
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
# Iterates over the elementwise products compA*compB = 2, 6, 12, 4, 10, 18;
# `next` fires on multiples of 3, `break` fires on the multiple of 5 (10),
# so 18 is never reached.
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
unique(compA)
## [1] 2 3 4 1
# unlist()
listA <- as.list(compA)
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
# times= repeats the whole vector; each= repeats every element in place.
# When combined, each= is applied first, then times= repeats the result.
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
# A vector times= gives each element its own repeat count:
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
# append(x, values, after) inserts after the `after`-th element (4th here):
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# gsub() with backreferences: "\\1", "\\2", "\\3" in the replacement refer
# to the parenthesized capture groups of the pattern (here: everything up
# to and including "@", the domain, and the trailing text).
sampMsg <- "This is from myname@subdomain.mydomain.com again"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev() reverses element order
# rev()
compA
## [1] 2 3 4 1 2 3
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
# NOTE(review): because sapply()'s return type depends on the data (compare
# the matrix result below to the list fallback two calls later), it is best
# avoided in non-interactive code -- prefer vapply(), which pins the shape.
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
args(simplify2array)
## function (x, higher = TRUE)
## NULL
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
# Equal-length results simplify to a matrix (one column per input):
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
# Ragged results cannot simplify, so sapply silently falls back to a list:
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
# FUN.VALUE is a template for each element's result (type and length):
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handling dates and times in R:
# Current date and date-time from the system clock
Sys.Date()
## [1] "2017-04-16"
Sys.time()
## [1] "2017-04-16 10:30:00 CDT"
args(strptime)
## function (x, format, tz = "")
## NULL
rightNow <- as.POSIXct(Sys.time())
# NOTE(review): %M is minutes, not month (that is %m). The "**30" in the
# output below is the minute value, so "%Y**%m-%d" was likely intended --
# confirm before reusing this format string.
format(rightNow, "%Y**%M-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2017**30-16 10 hours and 30 minutes CDT"
# %X parses the time portion as "%H:%M:%S" on input
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
# Subtracting POSIXct values yields a difftime:
rightNow - lastChristmasNoon
## Time difference of 477.8958 days
# %y = 2-digit year, %b = abbreviated month name; tz= pins the time zone
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of -224.9792 days
# Time zones available in R
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Baghdad" "Asia/Bahrain"
## [243] "Asia/Baku" "Asia/Bangkok"
## [245] "Asia/Beirut" "Asia/Bishkek"
## [247] "Asia/Brunei" "Asia/Calcutta"
## [249] "Asia/Chita" "Asia/Choibalsan"
## [251] "Asia/Chongqing" "Asia/Chungking"
## [253] "Asia/Colombo" "Asia/Dacca"
## [255] "Asia/Damascus" "Asia/Dhaka"
## [257] "Asia/Dili" "Asia/Dubai"
## [259] "Asia/Dushanbe" "Asia/Gaza"
## [261] "Asia/Harbin" "Asia/Hebron"
## [263] "Asia/Ho_Chi_Minh" "Asia/Hong_Kong"
## [265] "Asia/Hovd" "Asia/Irkutsk"
## [267] "Asia/Istanbul" "Asia/Jakarta"
## [269] "Asia/Jayapura" "Asia/Jerusalem"
## [271] "Asia/Kabul" "Asia/Kamchatka"
## [273] "Asia/Karachi" "Asia/Kashgar"
## [275] "Asia/Kathmandu" "Asia/Katmandu"
## [277] "Asia/Khandyga" "Asia/Kolkata"
## [279] "Asia/Krasnoyarsk" "Asia/Kuala_Lumpur"
## [281] "Asia/Kuching" "Asia/Kuwait"
## [283] "Asia/Macao" "Asia/Macau"
## [285] "Asia/Magadan" "Asia/Makassar"
## [287] "Asia/Manila" "Asia/Muscat"
## [289] "Asia/Nicosia" "Asia/Novokuznetsk"
## [291] "Asia/Novosibirsk" "Asia/Omsk"
## [293] "Asia/Oral" "Asia/Phnom_Penh"
## [295] "Asia/Pontianak" "Asia/Pyongyang"
## [297] "Asia/Qatar" "Asia/Qyzylorda"
## [299] "Asia/Rangoon" "Asia/Riyadh"
## [301] "Asia/Saigon" "Asia/Sakhalin"
## [303] "Asia/Samarkand" "Asia/Seoul"
## [305] "Asia/Shanghai" "Asia/Singapore"
## [307] "Asia/Srednekolymsk" "Asia/Taipei"
## [309] "Asia/Tashkent" "Asia/Tbilisi"
## [311] "Asia/Tehran" "Asia/Tel_Aviv"
## [313] "Asia/Thimbu" "Asia/Thimphu"
## [315] "Asia/Tokyo" "Asia/Ujung_Pandang"
## [317] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [319] "Asia/Urumqi" "Asia/Ust-Nera"
## [321] "Asia/Vientiane" "Asia/Vladivostok"
## [323] "Asia/Yakutsk" "Asia/Yekaterinburg"
## [325] "Asia/Yerevan" "Atlantic/Azores"
## [327] "Atlantic/Bermuda" "Atlantic/Canary"
## [329] "Atlantic/Cape_Verde" "Atlantic/Faeroe"
## [331] "Atlantic/Faroe" "Atlantic/Jan_Mayen"
## [333] "Atlantic/Madeira" "Atlantic/Reykjavik"
## [335] "Atlantic/South_Georgia" "Atlantic/St_Helena"
## [337] "Atlantic/Stanley" "Australia/ACT"
## [339] "Australia/Adelaide" "Australia/Brisbane"
## [341] "Australia/Broken_Hill" "Australia/Canberra"
## [343] "Australia/Currie" "Australia/Darwin"
## [345] "Australia/Eucla" "Australia/Hobart"
## [347] "Australia/LHI" "Australia/Lindeman"
## [349] "Australia/Lord_Howe" "Australia/Melbourne"
## [351] "Australia/North" "Australia/NSW"
## [353] "Australia/Perth" "Australia/Queensland"
## [355] "Australia/South" "Australia/Sydney"
## [357] "Australia/Tasmania" "Australia/Victoria"
## [359] "Australia/West" "Australia/Yancowinna"
## [361] "Brazil/Acre" "Brazil/DeNoronha"
## [363] "Brazil/East" "Brazil/West"
## [365] "Canada/Atlantic" "Canada/Central"
## [367] "Canada/East-Saskatchewan" "Canada/Eastern"
## [369] "Canada/Mountain" "Canada/Newfoundland"
## [371] "Canada/Pacific" "Canada/Saskatchewan"
## [373] "Canada/Yukon" "CET"
## [375] "Chile/Continental" "Chile/EasterIsland"
## [377] "CST6CDT" "Cuba"
## [379] "EET" "Egypt"
## [381] "Eire" "EST"
## [383] "EST5EDT" "Etc/GMT"
## [385] "Etc/GMT-0" "Etc/GMT-1"
## [387] "Etc/GMT-10" "Etc/GMT-11"
## [389] "Etc/GMT-12" "Etc/GMT-13"
## [391] "Etc/GMT-14" "Etc/GMT-2"
## [393] "Etc/GMT-3" "Etc/GMT-4"
## [395] "Etc/GMT-5" "Etc/GMT-6"
## [397] "Etc/GMT-7" "Etc/GMT-8"
## [399] "Etc/GMT-9" "Etc/GMT+0"
## [401] "Etc/GMT+1" "Etc/GMT+10"
## [403] "Etc/GMT+11" "Etc/GMT+12"
## [405] "Etc/GMT+2" "Etc/GMT+3"
## [407] "Etc/GMT+4" "Etc/GMT+5"
## [409] "Etc/GMT+6" "Etc/GMT+7"
## [411] "Etc/GMT+8" "Etc/GMT+9"
## [413] "Etc/GMT0" "Etc/Greenwich"
## [415] "Etc/UCT" "Etc/Universal"
## [417] "Etc/UTC" "Etc/Zulu"
## [419] "Europe/Amsterdam" "Europe/Andorra"
## [421] "Europe/Athens" "Europe/Belfast"
## [423] "Europe/Belgrade" "Europe/Berlin"
## [425] "Europe/Bratislava" "Europe/Brussels"
## [427] "Europe/Bucharest" "Europe/Budapest"
## [429] "Europe/Busingen" "Europe/Chisinau"
## [431] "Europe/Copenhagen" "Europe/Dublin"
## [433] "Europe/Gibraltar" "Europe/Guernsey"
## [435] "Europe/Helsinki" "Europe/Isle_of_Man"
## [437] "Europe/Istanbul" "Europe/Jersey"
## [439] "Europe/Kaliningrad" "Europe/Kiev"
## [441] "Europe/Lisbon" "Europe/Ljubljana"
## [443] "Europe/London" "Europe/Luxembourg"
## [445] "Europe/Madrid" "Europe/Malta"
## [447] "Europe/Mariehamn" "Europe/Minsk"
## [449] "Europe/Monaco" "Europe/Moscow"
## [451] "Europe/Nicosia" "Europe/Oslo"
## [453] "Europe/Paris" "Europe/Podgorica"
## [455] "Europe/Prague" "Europe/Riga"
## [457] "Europe/Rome" "Europe/Samara"
## [459] "Europe/San_Marino" "Europe/Sarajevo"
## [461] "Europe/Simferopol" "Europe/Skopje"
## [463] "Europe/Sofia" "Europe/Stockholm"
## [465] "Europe/Tallinn" "Europe/Tirane"
## [467] "Europe/Tiraspol" "Europe/Uzhgorod"
## [469] "Europe/Vaduz" "Europe/Vatican"
## [471] "Europe/Vienna" "Europe/Vilnius"
## [473] "Europe/Volgograd" "Europe/Warsaw"
## [475] "Europe/Zagreb" "Europe/Zaporozhye"
## [477] "Europe/Zurich" "GB"
## [479] "GB-Eire" "GMT"
## [481] "GMT-0" "GMT+0"
## [483] "GMT0" "Greenwich"
## [485] "Hongkong" "HST"
## [487] "Iceland" "Indian/Antananarivo"
## [489] "Indian/Chagos" "Indian/Christmas"
## [491] "Indian/Cocos" "Indian/Comoro"
## [493] "Indian/Kerguelen" "Indian/Mahe"
## [495] "Indian/Maldives" "Indian/Mauritius"
## [497] "Indian/Mayotte" "Indian/Reunion"
## [499] "Iran" "Israel"
## [501] "Jamaica" "Japan"
## [503] "Kwajalein" "Libya"
## [505] "MET" "Mexico/BajaNorte"
## [507] "Mexico/BajaSur" "Mexico/General"
## [509] "MST" "MST7MDT"
## [511] "Navajo" "NZ"
## [513] "NZ-CHAT" "Pacific/Apia"
## [515] "Pacific/Auckland" "Pacific/Bougainville"
## [517] "Pacific/Chatham" "Pacific/Chuuk"
## [519] "Pacific/Easter" "Pacific/Efate"
## [521] "Pacific/Enderbury" "Pacific/Fakaofo"
## [523] "Pacific/Fiji" "Pacific/Funafuti"
## [525] "Pacific/Galapagos" "Pacific/Gambier"
## [527] "Pacific/Guadalcanal" "Pacific/Guam"
## [529] "Pacific/Honolulu" "Pacific/Johnston"
## [531] "Pacific/Kiritimati" "Pacific/Kosrae"
## [533] "Pacific/Kwajalein" "Pacific/Majuro"
## [535] "Pacific/Marquesas" "Pacific/Midway"
## [537] "Pacific/Nauru" "Pacific/Niue"
## [539] "Pacific/Norfolk" "Pacific/Noumea"
## [541] "Pacific/Pago_Pago" "Pacific/Palau"
## [543] "Pacific/Pitcairn" "Pacific/Pohnpei"
## [545] "Pacific/Ponape" "Pacific/Port_Moresby"
## [547] "Pacific/Rarotonga" "Pacific/Saipan"
## [549] "Pacific/Samoa" "Pacific/Tahiti"
## [551] "Pacific/Tarawa" "Pacific/Tongatapu"
## [553] "Pacific/Truk" "Pacific/Wake"
## [555] "Pacific/Wallis" "Pacific/Yap"
## [557] "Poland" "Portugal"
## [559] "PRC" "PST8PDT"
## [561] "ROC" "ROK"
## [563] "Singapore" "Turkey"
## [565] "UCT" "Universal"
## [567] "US/Alaska" "US/Aleutian"
## [569] "US/Arizona" "US/Central"
## [571] "US/East-Indiana" "US/Eastern"
## [573] "US/Hawaii" "US/Indiana-Starke"
## [575] "US/Michigan" "US/Mountain"
## [577] "US/Pacific" "US/Pacific-New"
## [579] "US/Samoa" "UTC"
## [581] "VERSION" "W-SU"
## [583] "WET" "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Additionally, code from several practice examples is added:
# Seed fixed so the simulated grades below are reproducible
set.seed(1608221310)
me <- 89
# 199 classmates' grades, drawn from a normal and rounded to whole points
other_199 <- round(rnorm(199, mean=75.45, sd=11.03), 0)
mean(other_199)
## [1] 75.17588
sd(other_199)
## [1] 11.37711
# Target mean and sd for each of the four prior years
desMeans <- c(72.275, 76.24, 74.5, 77.695)
desSD <- c(12.31, 11.22, 12.5, 12.53)
prevData <- c(rnorm(200, mean=72.275, sd=12.31),
rnorm(200, mean=76.24, sd=11.22),
rnorm(200, mean=74.5, sd=12.5),
rnorm(200, mean=77.695, sd=12.53)
)
previous_4 <- matrix(data=prevData, ncol=4)
# Observed column means/sds of the raw draws
curMeans <- apply(previous_4, 2, FUN=mean)
curSD <- apply(previous_4, 2, FUN=sd)
# Linearly rescale each column so its mean/sd hit the desired targets
# exactly; apply over rows returns columns, hence the transpose t().
previous_4 <- t(apply(previous_4, 1, FUN=function(x) { desMeans + (desSD / curSD) * (x - curMeans) } ))
apply(round(previous_4, 0), 2, FUN=mean)
## [1] 72.285 76.245 74.505 77.665
apply(round(previous_4, 0), 2, FUN=sd)
## [1] 12.35097 11.19202 12.49643 12.51744
previous_4 <- round(previous_4, 0)
# Merge me and other_199: my_class
my_class <- c(me, other_199)
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4
## Min. : 46.00 Min. : 43.00 Min. : 38.00 Min. : 42.00
## 1st Qu.: 68.00 1st Qu.: 63.75 1st Qu.: 69.00 1st Qu.: 65.75
## Median : 75.50 Median : 73.00 Median : 76.50 Median : 74.00
## Mean : 75.25 Mean : 72.28 Mean : 76.25 Mean : 74.50
## 3rd Qu.: 83.25 3rd Qu.: 81.00 3rd Qu.: 84.25 3rd Qu.: 82.25
## Max. :108.00 Max. :108.00 Max. :102.00 Max. :113.00
## year_5
## Min. : 38.00
## 1st Qu.: 71.00
## Median : 78.00
## Mean : 77.67
## 3rd Qu.: 86.00
## Max. :117.00
# Build boxplot of last_5
boxplot(last_5)
# How many grades in your class are higher than 75?
sum(my_class > 75)
## [1] 100
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 17
# What's the proportion of grades below or equal to 64 in the last 5 years?
mean(last_5 <= 64)
## [1] 0.191
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
my_class < 60 | my_class > 90
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [78] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [100] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [155] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [166] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [199] FALSE FALSE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.525
# How many students in the last 5 years had a grade of 80 or 90?
sum(last_5 %in% c(80, 90))
## [1] 44
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct
if (prop_less > 0.9) {
print("you're among the best 10 percent")
} else if (prop_less > 0.8) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) { print("top grades prevail") }
## [1] "top grades prevail"
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early returns in special cases (for example, exiting a function as soon as an input check fails).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent environment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
# Demonstrates R's return semantics: the value of the last evaluated
# expression is the function's result. For x <= 2 a message is printed and
# 3 is returned; otherwise the rounded-up input ceiling(x) is returned.
funDummy <- function(x) {
  if (x <= 2) {
    print("That is too small")
    3
  } else {
    ceiling(x)
  }
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be heterogeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. All math operations with NA return NA; for example NA == NA will return NA.
There are some good tips on extracting elements from a list:
# Data types
data(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
length(mtcars)
## [1] 11
# NULL and NA
# NULL is the absence of a vector (length 0); NA is a missing element
# within a vector (length 1, default type logical).
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
# Comparisons involving NULL produce a zero-length logical, not TRUE/FALSE:
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
# NA propagates through comparisons -- test with is.na(), never == NA:
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
# is.na(NULL) warns and returns logical(0) rather than erroring:
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
# Print the storage type (typeof) of each column/element of x, one per line.
# Returns NULL invisibly; prints nothing when x is empty (see foo(x) below).
foo <- function(x) {
  for (column in x) {
    print(typeof(column))
  }
}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming convention:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
# Higher-order helper: apply the caller-supplied function `fun` to `x`.
# Illustrates passing functions as arguments (functional programming).
do_math <- function(x, fun) {
  fun(x)
}
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
## Warning: package 'purrr' was built under R version 3.2.5
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would bomb since it is not actually an integer
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
map_lgl(.x=mtcars, .f = ~ ifelse(sum(.) > 200, TRUE, FALSE) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
map_chr(.x=mtcars, .f = ~ ifelse(sum(.) > 200, "Large", "Not So Large") )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 0.41176421 2.00652288 0.06152025 0.46963873 1.15436157
##
## [[2]]
## [1] 0.006821057 2.902712636 1.436150816 1.377836302 2.625075832
## [6] 0.680797806 0.313499192 0.718062969 2.820989906 3.134207742
##
## [[3]]
## [1] 3.3716474 2.9393673 1.8648940 3.2343343 2.1849894 2.0697179 1.0872014
## [8] 3.4970403 3.5769694 3.0999340 1.2033341 0.9839011 2.9820314 1.7116383
## [15] 0.8779558 1.6990118 2.5914013 2.3587803 3.7460957 1.2980312
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0151570 1.1573287 1.0628581 0.8805484 0.9418430
##
## [[2]]
## [1] 5.032920 4.689799 5.423525 5.265610 4.727383 5.252325 5.166292
## [8] 4.861745 5.135408 4.106679
##
## [[3]]
## [1] 9.854138 10.090939 10.045554 9.970755 10.092487 9.769531 10.140064
## [8] 9.834716 10.196817 10.047367 10.054093 10.006439 10.142002 10.092259
## [15] 10.222459 10.082440 10.067818 9.993884 10.078380 9.936942
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] -0.96707137 0.08207476 1.39498168 0.60287972 -0.15130461
##
## [[2]]
## [1] 0.01087442 0.02980483 0.81443586 0.88438198 0.67976034
##
## [[3]]
## [1] 0.2646751 1.3233260 1.1079261 1.3504952 0.6795524
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
set.seed(1607120947)
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
# Apply summary() to every element of x, returning a list of summaries
# (uses purrr::map, loaded above via library(purrr)).
foo <- function(x) {
  map(.x = x, .f = summary)
}
par(mfrow=c(1, 3))
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply() which will return a vector when it can and a list otherwise. The map() and map_typ() functions in purrr are designed to be type-stable; if the output is not as expected, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Behavior of [,] and [,,drop=FALSE]
mtxTest <- matrix(data=1:9, nrow=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# Behavior of sapply() - may not get what you are expecting
# Square each element of x. Vectorized; note `^` always returns double,
# so foo(1:5) is numeric, matching the class shown in the output below.
foo <- function(x) {
  x^2
}
sapply(1:5, FUN=foo)
## [1] 1 4 9 16 25
class(sapply(1:5, FUN=foo))
## [1] "numeric"
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "numeric"
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "list"
This was a very enjoyable and instructive course.
Chapter 1 - Introduction to Object Oriented Programming (OOP)
Typical R usage involves a functional programming style - data to function to new data to new function to newer values, and so on. Object Oriented Programming (OOP) instead involves thinking about the data structures (objects), their functionality, and the like:
There are nine different options for OOP in R:
How does R distinguish types of variables?
Assigning Classes and Implicit Classes:
Example code includes:
# Create these variables
a_numeric_vector <- rlnorm(50)
a_factor <- factor(
sample(c(LETTERS[1:5], NA), 50, replace = TRUE)
)
a_data_frame <- data.frame(
n = a_numeric_vector,
f = a_factor
)
a_linear_model <- lm(dist ~ speed, cars)
# Call summary() on the numeric vector
summary(a_numeric_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08694 0.58120 1.06400 1.63500 1.48800 7.43600
# Do the same for the other three objects
summary(a_factor)
## A B C D E NA's
## 5 9 8 11 11 6
summary(a_data_frame)
## n f
## Min. :0.08694 A : 5
## 1st Qu.:0.58121 B : 9
## Median :1.06361 C : 8
## Mean :1.63546 D :11
## 3rd Qu.:1.48764 E :11
## Max. :7.43560 NA's: 6
summary(a_linear_model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Report the four base-R views of an object's "type" as a named character
# vector: class(), typeof(), mode(), and storage.mode().
type_info <- function(x) {
  c(
    class = class(x),
    typeof = typeof(x),
    mode = mode(x),
    storage.mode = storage.mode(x)
  )
}
# Create list of example variables
some_vars <- list(
an_integer_vector = rpois(24, lambda = 5),
a_numeric_vector = rbeta(24, shape1 = 1, shape2 = 1),
an_integer_array = array(rbinom(24, size = 8, prob = 0.5), dim = c(2, 3, 4)),
a_numeric_array = array(rweibull(24, shape = 1, scale = 1), dim = c(2, 3, 4)),
a_data_frame = data.frame(int = rgeom(24, prob = 0.5), num = runif(24)),
a_factor = factor(month.abb),
a_formula = y ~ x,
a_closure_function = mean,
a_builtin_function = length,
a_special_function = `if`
)
# Loop over some_vars calling type_info() on each element to explore them
lapply(some_vars, FUN=type_info)
## $an_integer_vector
## class typeof mode storage.mode
## "integer" "integer" "numeric" "integer"
##
## $a_numeric_vector
## class typeof mode storage.mode
## "numeric" "double" "numeric" "double"
##
## $an_integer_array
## class typeof mode storage.mode
## "array" "integer" "numeric" "integer"
##
## $a_numeric_array
## class typeof mode storage.mode
## "array" "double" "numeric" "double"
##
## $a_data_frame
## class typeof mode storage.mode
## "data.frame" "list" "list" "list"
##
## $a_factor
## class typeof mode storage.mode
## "factor" "integer" "numeric" "integer"
##
## $a_formula
## class typeof mode storage.mode
## "formula" "language" "call" "language"
##
## $a_closure_function
## class typeof mode storage.mode
## "function" "closure" "function" "function"
##
## $a_builtin_function
## class typeof mode storage.mode
## "function" "builtin" "function" "function"
##
## $a_special_function
## class typeof mode storage.mode
## "function" "special" "function" "function"
whiteChess <- list(king="g1", queen="h4", bishops=c("c2", "g5"), knights=character(0), rooks=c("f1", "f6"), pawns=c("a2", "b2", "d4", "e3", "g2", "h2"))
blackChess <- list(king="g8", queen="d7", bishops=c("b7", "e7"), knights=character(0), rooks=c("a6", "f8"), pawns=c("a5", "c3", "c4", "d5", "f7", "g6"))
chess <- list(white=whiteChess, black=blackChess)
# Explore the structure of chess
str(chess)
## List of 2
## $ white:List of 6
## ..$ king : chr "g1"
## ..$ queen : chr "h4"
## ..$ bishops: chr [1:2] "c2" "g5"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "f1" "f6"
## ..$ pawns : chr [1:6] "a2" "b2" "d4" "e3" ...
## $ black:List of 6
## ..$ king : chr "g8"
## ..$ queen : chr "d7"
## ..$ bishops: chr [1:2] "b7" "e7"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "a6" "f8"
## ..$ pawns : chr [1:6] "a5" "c3" "c4" "d5" ...
# Override the class of chess
class(chess) <- "chess_game"
# Is chess still a list?
is.list(chess)
## [1] TRUE
# How many pieces are left on the board?
length(unlist(chess))
## [1] 24
type_info(chess) # note that typeof(), mode(), and storage.mode() all remained as list
## class typeof mode storage.mode
## "chess_game" "list" "list" "list"
Chapter 2 - Using S3
Function overloading is the property of a function of input-dependent behavior:
Methodical Thinking - determining which methods are available for an S3 generic:
S3 and Primitive Functions:
Too Much Class:
Example code includes:
# Create get_n_elements: an S3 generic. UseMethod() dispatches to a
# get_n_elements.<class> method based on class(x); specific methods
# (data.frame, default) are defined below.
get_n_elements <- function(x, ...) {
UseMethod("get_n_elements")
}
# View get_n_elements
get_n_elements
## function(x, ...) {
## UseMethod("get_n_elements")
## }
# data.frame method for get_n_elements: total cell count = rows x columns.
get_n_elements.data.frame <- function(x, ...) {
  dims <- dim(x)
  dims[1L] * dims[2L]
}
# Call the method on the sleep dataset
n_elements_sleep <- get_n_elements(sleep)
# View the result
n_elements_sleep
## [1] 60
# View pre-defined objects
# ls.str() ## Do not run, this can be a cluster with many variables loaded . . .
# Fallback method for get_n_elements: flatten x with unlist() and count
# the resulting leaves. Works for lists, vectors, and mixed structures.
get_n_elements.default <- function(x, ...) {
  flattened <- unlist(x)
  length(flattened)
}
# Call the method on the ability.cov dataset
n_elements_ability.cov <- get_n_elements(ability.cov)
# Find methods for print
methods("print")
## [1] print.acf*
## [2] print.AES*
## [3] print.anova*
## [4] print.aov*
## [5] print.aovlist*
## [6] print.ar*
## [7] print.Arima*
## [8] print.arima0*
## [9] print.AsIs
## [10] print.aspell*
## [11] print.aspell_inspect_context*
## [12] print.bibentry*
## [13] print.Bibtex*
## [14] print.browseVignettes*
## [15] print.by
## [16] print.bytes*
## [17] print.changedFiles*
## [18] print.check_code_usage_in_package*
## [19] print.check_compiled_code*
## [20] print.check_demo_index*
## [21] print.check_depdef*
## [22] print.check_dotInternal*
## [23] print.check_make_vars*
## [24] print.check_nonAPI_calls*
## [25] print.check_package_code_assign_to_globalenv*
## [26] print.check_package_code_attach*
## [27] print.check_package_code_data_into_globalenv*
## [28] print.check_package_code_startup_functions*
## [29] print.check_package_code_syntax*
## [30] print.check_package_code_unload_functions*
## [31] print.check_package_compact_datasets*
## [32] print.check_package_CRAN_incoming*
## [33] print.check_package_datasets*
## [34] print.check_package_depends*
## [35] print.check_package_description*
## [36] print.check_package_description_encoding*
## [37] print.check_package_license*
## [38] print.check_packages_in_dir*
## [39] print.check_packages_in_dir_changes*
## [40] print.check_packages_used*
## [41] print.check_po_files*
## [42] print.check_Rd_contents*
## [43] print.check_Rd_line_widths*
## [44] print.check_Rd_metadata*
## [45] print.check_Rd_xrefs*
## [46] print.check_so_symbols*
## [47] print.check_T_and_F*
## [48] print.check_url_db*
## [49] print.check_vignette_index*
## [50] print.checkDocFiles*
## [51] print.checkDocStyle*
## [52] print.checkFF*
## [53] print.checkRd*
## [54] print.checkReplaceFuns*
## [55] print.checkS3methods*
## [56] print.checkTnF*
## [57] print.checkVignettes*
## [58] print.citation*
## [59] print.codoc*
## [60] print.codocClasses*
## [61] print.codocData*
## [62] print.colorConverter*
## [63] print.compactPDF*
## [64] print.condition
## [65] print.connection
## [66] print.data.frame
## [67] print.Date
## [68] print.default
## [69] print.dendrogram*
## [70] print.density*
## [71] print.difftime
## [72] print.dist*
## [73] print.Dlist
## [74] print.DLLInfo
## [75] print.DLLInfoList
## [76] print.DLLRegisteredRoutines
## [77] print.dummy_coef*
## [78] print.dummy_coef_list*
## [79] print.ecdf*
## [80] print.factanal*
## [81] print.factor
## [82] print.family*
## [83] print.fileSnapshot*
## [84] print.findLineNumResult*
## [85] print.formula*
## [86] print.fseq*
## [87] print.ftable*
## [88] print.function
## [89] print.getAnywhere*
## [90] print.glm*
## [91] print.hclust*
## [92] print.help_files_with_topic*
## [93] print.hexmode
## [94] print.HoltWinters*
## [95] print.hsearch*
## [96] print.hsearch_db*
## [97] print.htest*
## [98] print.html*
## [99] print.infl*
## [100] print.integrate*
## [101] print.isoreg*
## [102] print.kmeans*
## [103] print.knitr_kable*
## [104] print.Latex*
## [105] print.LaTeX*
## [106] print.lazy*
## [107] print.libraryIQR
## [108] print.listof
## [109] print.lm*
## [110] print.loadings*
## [111] print.loess*
## [112] print.logLik*
## [113] print.ls_str*
## [114] print.medpolish*
## [115] print.MethodsFunction*
## [116] print.mtable*
## [117] print.NativeRoutineList
## [118] print.news_db*
## [119] print.nls*
## [120] print.noquote
## [121] print.numeric_version
## [122] print.object_size*
## [123] print.octmode
## [124] print.packageDescription*
## [125] print.packageInfo
## [126] print.packageIQR*
## [127] print.packageStatus*
## [128] print.pairwise.htest*
## [129] print.PDF_Array*
## [130] print.PDF_Dictionary*
## [131] print.pdf_doc*
## [132] print.pdf_fonts*
## [133] print.PDF_Indirect_Reference*
## [134] print.pdf_info*
## [135] print.PDF_Keyword*
## [136] print.PDF_Name*
## [137] print.PDF_Stream*
## [138] print.PDF_String*
## [139] print.person*
## [140] print.POSIXct
## [141] print.POSIXlt
## [142] print.power.htest*
## [143] print.ppr*
## [144] print.prcomp*
## [145] print.princomp*
## [146] print.proc_time
## [147] print.raster*
## [148] print.Rd*
## [149] print.recordedplot*
## [150] print.restart
## [151] print.RGBcolorConverter*
## [152] print.rle
## [153] print.roman*
## [154] print.SavedPlots*
## [155] print.sessionInfo*
## [156] print.shiny.tag*
## [157] print.shiny.tag.list*
## [158] print.simple.list
## [159] print.smooth.spline*
## [160] print.socket*
## [161] print.srcfile
## [162] print.srcref
## [163] print.stepfun*
## [164] print.stl*
## [165] print.StructTS*
## [166] print.subdir_tests*
## [167] print.summarize_CRAN_check_status*
## [168] print.summary.aov*
## [169] print.summary.aovlist*
## [170] print.summary.ecdf*
## [171] print.summary.glm*
## [172] print.summary.lm*
## [173] print.summary.loess*
## [174] print.summary.manova*
## [175] print.summary.nls*
## [176] print.summary.packageStatus*
## [177] print.summary.ppr*
## [178] print.summary.prcomp*
## [179] print.summary.princomp*
## [180] print.summary.table
## [181] print.summaryDefault
## [182] print.table
## [183] print.tables_aov*
## [184] print.terms*
## [185] print.ts*
## [186] print.tskernel*
## [187] print.TukeyHSD*
## [188] print.tukeyline*
## [189] print.tukeysmooth*
## [190] print.undoc*
## [191] print.vignette*
## [192] print.warnings
## [193] print.xgettext*
## [194] print.xngettext*
## [195] print.xtabs*
## see '?methods' for accessing help and source code
# Commented due to no dataset "hair" on my machine
# View the structure of hair
# str(hair)
# What primitive generics are available?
.S3PrimitiveGenerics
## [1] "anyNA" "as.character" "as.complex" "as.double"
## [5] "as.environment" "as.integer" "as.logical" "as.numeric"
## [9] "as.raw" "c" "dim" "dim<-"
## [13] "dimnames" "dimnames<-" "is.array" "is.finite"
## [17] "is.infinite" "is.matrix" "is.na" "is.nan"
## [21] "is.numeric" "length" "length<-" "levels<-"
## [25] "names" "names<-" "rep" "seq.int"
## [29] "xtfrm"
# Does length.hairstylist exist?
# exists("length.hairstylist")
# What is the length of hair?
# length(hair)
kitty <- "Miaow!"
# Assign classes
class(kitty) <- c("cat", "mammal", "character")
# Does kitty inherit from cat/mammal/character vector?
inherits(kitty, "cat")
## [1] TRUE
inherits(kitty, "mammal")
## [1] TRUE
inherits(kitty, "character")
## [1] TRUE
# Is kitty a character vector?
is.character(kitty)
## [1] TRUE
# Does kitty inherit from dog?
inherits(kitty, "dog")
## [1] FALSE
# S3 generic for describing an object. Dispatch walks the object's class
# vector (e.g. kitty's c("cat", "mammal", "character") above); individual
# methods may continue the walk via NextMethod().
what_am_i <-
function(x, ...)
{
UseMethod("what_am_i")
}
# cat method: first stop in the dispatch chain for an object whose class
# vector begins with "cat" (like kitty above)
what_am_i.cat <- function(x, ...)
{
# Write a message
print("I'm a cat")
# Call NextMethod: continue dispatch with the next class in the vector
NextMethod("what_am_i")
}
# mammal method: middle of the chain; announces itself and keeps chaining
what_am_i.mammal <- function(x, ...)
{
# Write a message
print("I'm a mammal")
# Call NextMethod: hand off to the next class's method ("character" here)
NextMethod("what_am_i")
}
# character method: terminal stop in the dispatch chain — it does not call
# NextMethod(), so dispatch ends here.
what_am_i.character <- function(x, ...) {
  print("I'm a character vector")
}
# Call what_am_i()
what_am_i(kitty)
## [1] "I'm a cat"
## [1] "I'm a mammal"
## [1] "I'm a character vector"
Chapter 3 - Using R6
Object factory - R6 provides a means of storing data and objects within the same variable:
Hiding Complexity with Encapsulation - should be able to use something even if the internal (hidden) functionality is very complicated:
Generally, data available in the “private” area of a class is not available to users:
Example code includes:
# R6 generator ("object factory") for MicrowaveOven objects. The power
# rating lives in the private list, so instances hide it from callers.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(
    power_rating_watts = 800
  )
)
# View the microwave_oven_factory
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## clone: function
## Private:
## power_rating_watts: 800
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Make a new microwave oven
microwave_oven <- microwave_oven_factory$new()
# Redefine the factory, adding a public cook() method to the interface.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(power_rating_watts = 800),
  public = list(
    # Simulate cooking: pause for time_seconds, then announce completion.
    cook = function(time_seconds) {
      Sys.sleep(time_seconds)
      print("Your food is cooked!")
    }
  )
)
# Create microwave oven object
a_microwave_oven <- microwave_oven_factory$new()
# Call cook method for 1 second
a_microwave_oven$cook(time_seconds=1)
## [1] "Your food is cooked!"
# Add a close_door() method. Door state is kept in private, so it can only
# be changed through the public open_door()/close_door() methods.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
# Private fields: hidden from users of the generated objects
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
# Public methods: the object's user-facing interface
public = list(
# Simulate cooking: block for time_seconds, then announce completion
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
}
)
)
# Add an initialize method: it runs automatically inside $new(), and the
# optional arguments override the private defaults when supplied.
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
},
# Add initialize() method here
# missing() distinguishes "argument not supplied" from any supplied value,
# so the defaults survive unless the caller passes something explicitly
initialize = function(power_rating_watts, door_is_open) {
if (!missing(power_rating_watts)) {
private$power_rating_watts <- power_rating_watts
}
if (!missing(door_is_open)) {
private$door_is_open <- door_is_open
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new(power_rating_watts=650, door_is_open=TRUE)
# Add a read-only active binding: power_rating_watts reads the private
# ..power_rating_watts field. The binding function takes no `value`
# argument, so the field is exposed read-only.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(..power_rating_watts = 800),
  active = list(
    power_rating_watts = function() private$..power_rating_watts
  )
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power rating
a_microwave_oven$power_rating_watts
## [1] 800
# Add a read/write active binding for the power level: reading returns the
# private value; writing validates the input first via the assertive.* packages
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800,
..power_level_watts = 800
),
# Add active list containing an active binding
active=list(
# Called with no `value` on read, with `value` on assignment
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
# severity="warning" makes invalid input warn (see output below) rather than error
assertive.types::assert_is_a_number(value, severity="warning")
assertive.numbers::assert_all_are_in_closed_range(value,
lower=0,
upper=private$..power_rating_watts,
severity="warning"
)
private$..power_level_watts <- value
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power level
a_microwave_oven$power_level_watts
## [1] 800
# Try to set the power level to "400"
a_microwave_oven$power_level_watts <- "400"
## Warning in (function (value) : is_a_number : value is not of class
## 'numeric'; it has class 'character'.
## Warning: Coercing value to class 'numeric'.
# Try to set the power level to 1600 watts
a_microwave_oven$power_level_watts <- 1600
## Warning in (function (value) : is_in_closed_range : value are not all in the range [0,800].
## There was 1 failure:
## Position Value Cause
## 1 1 1600 too high
# Set the power level to 400 watts
a_microwave_oven$power_level_watts <- 400
Chapter 4 - R6 Inheritance
Inheritance is an attempt to avoid “copy and paste” from one class to another (dependent, fancier, or the like) class:
Extend or Override to create additional functionality:
Multiple Levels of Inheritance - a can inherit from b that inherited from c and the like:
Example code includes:
# Consolidated MicrowaveOven class used by the inheritance examples below:
# private state, door/cook methods, a read-only rating binding, and a
# power-level binding that clamps writes into [0, rating]
microwave_oven_factory <-
R6::R6Class("MicrowaveOven",
private=list(..power_rating_watts=800,
..power_level_watts=800,
..door_is_open=FALSE
),
public=list(cook=function(time) Sys.sleep(time),
open_door=function() private$..door_is_open <- TRUE,
close_door = function() private$..door_is_open <- FALSE
),
# power_rating_watts takes no `value` argument, so it is read-only
active=list(power_rating_watts=function() private$..power_rating_watts,
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
# Clamp the requested level into [0, ..power_rating_watts]
private$..power_level_watts <-
max(0,
min(private$..power_rating_watts,
as.numeric(value)
)
)
}
}
)
)
# Explore the microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# FancyMicrowaveOven: inherits every public/private/active member from the
# plain MicrowaveOven class without adding anything of its own yet.
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory
)
# Explore microwave oven classes
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Public:
## clone: function
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Instantiate both types of microwave
a_microwave_oven <- microwave_oven_factory$new()
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Get power rating for each microwave
microwave_power_rating <- a_microwave_oven$power_level_watts
fancy_microwave_power_rating <- a_fancy_microwave$power_level_watts
# Verify that these are the same
identical(microwave_power_rating, fancy_microwave_power_rating)
## [1] TRUE
# Cook with each microwave
a_microwave_oven$cook(1)
a_fancy_microwave$cook(1)
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Extend FancyMicrowaveOven with a convenience method that builds on the
# inherited cook() (accessed through self$).
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # Bake a potato: delegate to the inherited cook() for 3 seconds.
    cook_baked_potato = function() {
      self$cook(3)
    }
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook_baked_potato() method
a_fancy_microwave$cook_baked_potato()
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Update the class definition
fancy_microwave_oven_factory <- R6::R6Class(
  "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # Override cook(): run the parent's implementation via super, then
    # emit a friendly message.
    cook = function(time_seconds) {
      super$cook(time_seconds)
      message("Enjoy your dinner!")
    }
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook() method
a_fancy_microwave$cook(1)
## Enjoy your dinner!
# Expose the parent functionality
fancy_microwave_oven_factory <- R6::R6Class(
  "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # Preset: cook for 3 time units via the overridden cook() below
    cook_baked_potato = function() {
      self$cook(3)
    },
    # Override cook(): parent behaviour plus a friendly message
    cook = function(time_seconds) {
      super$cook(time_seconds)
      message("Enjoy your dinner!")
    }
  ),
  # Active binding that hands out this class's super object, so that a
  # subclass can reach its grandparent's methods through super$super_
  active = list(
    super_ = function() super
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the super_ binding
a_fancy_microwave$super_
## <environment: 0x000000000b667e68>
ascii_pizza_slice <- " __\n // \"\"--.._\n|| (_) _ \"-._\n|| _ (_) '-.\n|| (_) __..-'\n \\\\__..--\"\""
# Explore other microwaves
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function
## open_door: function
## close_door: function
## clone: function
## Active bindings:
## power_rating_watts: function
## power_level_watts: function
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Public:
## cook_baked_potato: function
## cook: function
## clone: function
## Active bindings:
## super_: function
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a high-end microwave oven class
# Inherits from FancyMicrowaveOven, which in turn inherits from MicrowaveOven.
high_end_microwave_oven_factory <- R6::R6Class(
"HighEndMicrowaveOven",
inherit=fancy_microwave_oven_factory,
public=list(
# Override cook(): bypass the parent's cook() (and its "Enjoy your
# dinner!" message) by calling the grandparent's cook() directly through
# the super_ active binding exposed on FancyMicrowaveOven, then print
# the ASCII pizza art.
cook=function(time_seconds) {
super$super_$cook(time_seconds)
message(ascii_pizza_slice)
}
)
)
# Instantiate a high-end microwave oven
a_high_end_microwave <- high_end_microwave_oven_factory$new()
# Use it to cook for one second
a_high_end_microwave$cook(1)
## __
## // ""--.._
## || (_) _ "-._
## || _ (_) '-.
## || (_) __..-'
## \\__..--""
Chapter 5 - Advanced R6 Usage
Environments, Reference Behavior, and Static Fields:
Cloning Objects - R6 is built using environments, so “copy by reference” semantics are part and parcel of R6:
Shut it Down - if the R6 object is linked to any databases or has any side effects, it can be a good idea to shut it down:
Example code includes:
# Demonstrate value semantics of lists vs. reference semantics of environments.
# Define a new environment
env <- new.env()
# Add an element named perfect via $ assignment
env$perfect <- c(6, 28, 496)
# Add an element named bases via [[ assignment
env[["bases"]] <- c("A", "C", "G", "T")
# Assign lst and env; list2env() builds an environment from a named list
lst <- list(
perfect = c(6, 28, 496),
bases = c("A", "C", "G", "T")
)
env <- list2env(lst)
# Copy lst
lst2 <- lst
# Change lst's bases element
lst$bases <- c("A", "C", "G", "U")
# Test lst and lst2 identical
# Lists copy by value, so lst2 keeps the original bases: not identical.
identical(lst$bases, lst2$bases)
## [1] FALSE
# Copy env
env2 <- env
# Change env's bases element
env$bases <- c("A", "C", "G", "U")
# Test env and env2 identical
# Environments copy by reference: env2 is the very same environment as
# env, so the modification is visible through both names.
identical(env$bases, env2$bases)
## [1] TRUE
# Complete the class definition
env_microwave_oven_factory <- R6::R6Class(
  "MicrowaveOven",
  private = list(
    # A single environment evaluated once at class-definition time and
    # shared by every instance -- the effect of a "static" field: a change
    # made through one object is visible through all objects.
    shared = {
      shared_env <- new.env()
      shared_env$safety_warning <- "Warning. Do not try to cook metal objects."
      shared_env
    }
  ),
  active = list(
    # Read/write accessor for the shared safety_warning value
    safety_warning = function(value) {
      if (missing(value)) {
        private$shared$safety_warning
      } else {
        private$shared$safety_warning <- value
      }
    }
  )
)
# Create two microwave ovens from the shared-environment class
a_microwave_oven <- env_microwave_oven_factory$new()
another_microwave_oven <- env_microwave_oven_factory$new()
# Change the safety warning for a_microwave_oven
a_microwave_oven$safety_warning <- "Warning. If the food is too hot you may scald yourself."
# Verify that the warning has changed for another_microwave:
# both instances read from the same private shared environment.
another_microwave_oven$safety_warning
## [1] "Warning. If the food is too hot you may scald yourself."
# Still uses microwave_oven_factory as defined in Chapter 4
# Create a microwave oven
a_microwave_oven <- microwave_oven_factory$new()
# Copy a_microwave_oven using <- (a reference copy: same underlying object)
assigned_microwave_oven <- a_microwave_oven
# Copy a_microwave_oven using clone() (an independent copy)
cloned_microwave_oven <- a_microwave_oven$clone()
# Change a_microwave_oven's power level
a_microwave_oven$power_level_watts <- 400
# Check a_microwave_oven & assigned_microwave_oven same
identical(a_microwave_oven$power_level_watts, assigned_microwave_oven$power_level_watts)
## [1] TRUE
# Check a_microwave_oven & cloned_microwave_oven different
!identical(a_microwave_oven$power_level_watts, cloned_microwave_oven$power_level_watts)
## [1] TRUE
# Commented, due to never defined power_plug
# Create a microwave oven
# a_microwave_oven <- microwave_oven_factory$new()
# Look at its power plug
# a_microwave_oven$power_plug
# Copy a_microwave_oven using clone(), no args
# cloned_microwave_oven <- a_microwave_oven$clone()
# Copy a_microwave_oven using clone(), deep = TRUE
# deep_cloned_microwave_oven <- a_microwave_oven$clone(deep=TRUE)
# Change a_microwave_oven's power plug type
# a_microwave_oven$power_plug$type <- "British"
# Check a_microwave_oven & cloned_microwave_oven same
# identical(a_microwave_oven$power_plug$type, cloned_microwave_oven$power_plug$type)
# Check a_microwave_oven & deep_cloned_microwave_oven different
# !identical(a_microwave_oven$power_plug$type, deep_cloned_microwave_oven$power_plug$type)
# Commented due to not having this SQL database
# Microwave_factory is pre-defined
# microwave_oven_factory
# Complete the class definition
# smart_microwave_oven_factory <- R6::R6Class(
# "SmartMicrowaveOven",
# inherit = microwave_oven_factory, # Specify inheritance
# private = list(
# conn = NULL
# ),
# public = list(
# initialize = function() {
# # Connect to the database
# private$conn = dbConnect(SQLite(), "cooking-times.sqlite")
# },
# get_cooking_time = function(food) {
# dbGetQuery(
# private$conn,
# sprintf("SELECT time_seconds FROM cooking_times WHERE food = '%s'", food)
# )
# },
# finalize = function() {
# message("Disconnecting from the cooking times database.")
# dbDisconnect(private$conn)
# }
# )
# )
# Create a smart microwave object
# a_smart_microwave <- smart_microwave_oven_factory$new()
# Call the get_cooking_time() method
# a_smart_microwave$get_cooking_time("soup")
# Remove the smart microwave
# rm(a_smart_microwave)
# Force garbage collection
# gc()
A nice introduction to S3 and R6.
The library(dplyr) is a grammar of data manipulation. It is written in C++, so you get the speed of compiled code with the convenience of R. It is in essence the data-frame-to-data-frame portion of plyr (plyr was the original Split-Apply-Combine). We may want to look into count(), transmute(), and other verbs added after this summary was written.
The examples use data(hflights) from library(hflights):
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:purrr':
##
## contains, order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
data(hflights)
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
The “tbl” is a special type of data frame, which is very helpful for printing:
An interesting way to do a lookup table:
See for example:
# Lookup table: a named character vector mapping carrier code -> carrier name.
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa"
)
# Indexing a named vector with a character vector performs an elementwise
# lookup by name -- a vectorized recode of the carrier codes.
hflights$Carrier <- lut[hflights$UniqueCarrier]
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier <chr> "American", "American", "American", "America...
There are five main verbs in dplyr:
There is also the group_by capability for summaries of sub-groups:
The dplyr library can also work with databases. It only loads the data that you need, and you do not need to know the relevant SQL code – dplyr writes the SQL code for you.
Basic select and mutate examples include:
data(hflights)
# Make it faster, as well as a prettier printer
# NOTE(review): tbl_df() is deprecated in current dplyr; tibble::as_tibble()
# is the modern equivalent -- confirm before modernizing this historical note.
hflights <- tbl_df(hflights)
hflights
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## * <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
# Select examples
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## # A tibble: 227,496 × 4
## ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int>
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## # ... with 227,486 more rows
select(hflights, Origin:Cancelled)
## # A tibble: 227,496 × 6
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## * <chr> <chr> <int> <int> <int> <int>
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## # ... with 227,486 more rows
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## # A tibble: 227,496 × 14
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## * <int> <int> <int> <int> <int> <int> <chr> <chr>
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## # ... with 227,486 more rows, and 6 more variables: Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
select(hflights, ends_with("Delay"))
## # A tibble: 227,496 × 2
## ArrDelay DepDelay
## * <int> <int>
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## # ... with 227,486 more rows
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## # A tibble: 227,496 × 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## * <chr> <int> <chr> <int> <chr>
## 1 AA 428 N576AA 0
## 2 AA 428 N557AA 0
## 3 AA 428 N541AA 0
## 4 AA 428 N403AA 0
## 5 AA 428 N492AA 0
## 6 AA 428 N262AA 0
## 7 AA 428 N493AA 0
## 8 AA 428 N477AA 0
## 9 AA 428 N476AA 0
## 10 AA 428 N504AA 0
## # ... with 227,486 more rows
select(hflights, ends_with("Time"), ends_with("Delay"))
## # A tibble: 227,496 × 6
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int> <int> <int>
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## # ... with 227,486 more rows
# Mutate example
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)
class(m1)
## [1] "tbl_df" "tbl" "data.frame"
m1
## # A tibble: 227,496 × 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, loss <int>, loss_ratio <dbl>
glimpse(m1)
## Observations: 227,496
## Variables: 23
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ loss <int> -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_ratio <dbl> -Inf, -10.00000000, 0.00000000, 0.00000000, ...
Additionally, examples for filter and arrange:
# Examples for filter
filter(hflights, Distance >= 3000) # All flights that traveled 3000 miles or more
## # A tibble: 527 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 31 1 924 1413 CO
## 2 2011 1 30 7 925 1410 CO
## 3 2011 1 29 6 1045 1445 CO
## 4 2011 1 28 5 1516 1916 CO
## 5 2011 1 27 4 950 1344 CO
## 6 2011 1 26 3 944 1350 CO
## 7 2011 1 25 2 924 1337 CO
## 8 2011 1 24 1 1144 1605 CO
## 9 2011 1 23 7 926 1335 CO
## 10 2011 1 22 6 942 1340 CO
## # ... with 517 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, UniqueCarrier %in% c("B6", "WN", "DL"))
## # A tibble: 48,679 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 654 1124 B6
## 2 2011 1 1 6 1639 2110 B6
## 3 2011 1 2 7 703 1113 B6
## 4 2011 1 2 7 1604 2040 B6
## 5 2011 1 3 1 659 1100 B6
## 6 2011 1 3 1 1801 2200 B6
## 7 2011 1 4 2 654 1103 B6
## 8 2011 1 4 2 1608 2034 B6
## 9 2011 1 5 3 700 1103 B6
## 10 2011 1 5 3 1544 1954 B6
## # ... with 48,669 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, (TaxiIn + TaxiOut) > AirTime) # Flights where taxiing took longer than flying
## # A tibble: 1,389 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 24 1 731 904 AA
## 2 2011 1 30 7 1959 2132 AA
## 3 2011 1 24 1 1621 1749 AA
## 4 2011 1 10 1 941 1113 AA
## 5 2011 1 31 1 1301 1356 CO
## 6 2011 1 31 1 2113 2215 CO
## 7 2011 1 31 1 1434 1539 CO
## 8 2011 1 31 1 900 1006 CO
## 9 2011 1 30 7 1304 1408 CO
## 10 2011 1 30 7 2004 2128 CO
## # ... with 1,379 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, DepTime < 500 | ArrTime > 2200) # Flights departed before 5am or arrived after 10pm
## # A tibble: 27,799 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 4 2 2100 2207 AA
## 2 2011 1 14 5 2119 2229 AA
## 3 2011 1 10 1 1934 2235 AA
## 4 2011 1 26 3 1905 2211 AA
## 5 2011 1 30 7 1856 2209 AA
## 6 2011 1 9 7 1938 2228 AS
## 7 2011 1 31 1 1919 2231 CO
## 8 2011 1 31 1 2116 2344 CO
## 9 2011 1 31 1 1850 2211 CO
## 10 2011 1 31 1 2102 2216 CO
## # ... with 27,789 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, DepDelay > 0, ArrDelay < 0) # Flights that departed late but arrived ahead of schedule
## # A tibble: 27,712 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 2 7 1401 1501 AA
## 2 2011 1 5 3 1405 1507 AA
## 3 2011 1 18 2 1408 1508 AA
## 4 2011 1 18 2 721 827 AA
## 5 2011 1 12 3 2015 2113 AA
## 6 2011 1 13 4 2020 2116 AA
## 7 2011 1 26 3 2009 2103 AA
## 8 2011 1 1 6 1631 1736 AA
## 9 2011 1 10 1 1639 1740 AA
## 10 2011 1 12 3 1631 1739 AA
## # ... with 27,702 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
filter(hflights, Cancelled == 1, DepDelay > 0) # Flights that were cancelled after being delayed
## # A tibble: 40 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 26 3 1926 NA CO
## 2 2011 1 11 2 1100 NA US
## 3 2011 1 19 3 1811 NA XE
## 4 2011 1 7 5 2028 NA XE
## 5 2011 2 4 5 1638 NA AA
## 6 2011 2 8 2 1057 NA CO
## 7 2011 2 2 3 802 NA XE
## 8 2011 2 9 3 904 NA XE
## 9 2011 2 1 2 1508 NA OO
## 10 2011 3 31 4 1016 NA CO
## # ... with 30 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
c1 <- filter(hflights, Dest == "JFK") # Flights that had JFK as their destination: c1
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep="-")) # Create a Date column: c2
select(c2, Date, DepTime, ArrTime, TailNum) # Print out a selection of columns of c2
## # A tibble: 695 × 4
## Date DepTime ArrTime TailNum
## <chr> <int> <int> <chr>
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## # ... with 685 more rows
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay)) # Definition of dtc
# Examples for arrange
arrange(dtc, DepDelay) # Arrange dtc by departure delays
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 23 6 605 NA F9
## 2 2011 1 17 1 916 NA XE
## 3 2011 12 1 4 541 NA US
## 4 2011 10 12 3 2022 NA MQ
## 5 2011 7 29 5 1424 NA CO
## 6 2011 9 29 4 1639 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 5 9 1 715 NA OO
## 9 2011 1 20 4 1413 NA UA
## 10 2011 1 17 1 831 NA WN
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(dtc, CancellationCode) # Arrange dtc so that cancellation reasons are grouped
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 20 4 1413 NA UA
## 2 2011 1 7 5 2028 NA XE
## 3 2011 2 4 5 1638 NA AA
## 4 2011 2 8 2 1057 NA CO
## 5 2011 2 1 2 1508 NA OO
## 6 2011 2 21 1 2257 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 3 18 5 727 NA UA
## 9 2011 4 4 1 1632 NA DL
## 10 2011 4 8 5 1608 NA WN
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(dtc, UniqueCarrier, DepDelay) # Arrange dtc according to carrier and departure delays
## # A tibble: 68 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 8 18 4 1808 NA AA
## 2 2011 2 4 5 1638 NA AA
## 3 2011 7 29 5 1424 NA CO
## 4 2011 1 26 3 1703 NA CO
## 5 2011 8 11 4 1320 NA CO
## 6 2011 7 25 1 1654 NA CO
## 7 2011 1 26 3 1926 NA CO
## 8 2011 3 31 4 1016 NA CO
## 9 2011 2 8 2 1057 NA CO
## 10 2011 4 4 1 1632 NA DL
## # ... with 58 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(hflights, UniqueCarrier, desc(DepDelay)) # Arrange by carrier and decreasing departure delays
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 12 12 1 650 808 AA
## 2 2011 11 19 6 1752 1910 AA
## 3 2011 12 22 4 1728 1848 AA
## 4 2011 10 23 7 2305 2 AA
## 5 2011 9 27 2 1206 1300 AA
## 6 2011 3 17 4 1647 1747 AA
## 7 2011 6 21 2 955 1315 AA
## 8 2011 5 20 5 2359 130 AA
## 9 2011 4 19 2 2023 2142 AA
## 10 2011 5 12 4 2133 53 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
arrange(hflights, DepDelay + ArrDelay) # Arrange flights by total delay (normal order)
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 3 7 1914 2039 XE
## 2 2011 8 31 3 934 1039 OO
## 3 2011 8 21 7 935 1039 OO
## 4 2011 8 28 7 2059 2206 OO
## 5 2011 8 29 1 935 1041 OO
## 6 2011 12 25 7 741 926 OO
## 7 2011 1 30 7 620 812 OO
## 8 2011 8 3 3 1741 1810 XE
## 9 2011 8 4 4 930 1041 OO
## 10 2011 8 18 4 939 1043 OO
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
Additionally, examples for the summarize verb:
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 2
## min_dist max_dist
## <int> <int>
## 1 79 3904
# Print out a summary with variable max_div
summarize(filter(hflights, Diverted == 1), max_div = max(Distance))
## # A tibble: 1 × 1
## max_div
## <int>
## 1 3904
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarize(temp1, earliest=min(ArrDelay), average=mean(ArrDelay), latest=max(ArrDelay), sd=sd(ArrDelay))
## # A tibble: 1 × 4
## earliest average latest sd
## <int> <dbl> <int> <dbl>
## 1 -70 7.094334 978 30.70852
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarise()
summarize(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## # A tibble: 1 × 1
## max_taxi_diff
## <int>
## 1 160
# Generate summarizing statistics for hflights
summarize(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest))
## # A tibble: 1 × 3
## n_obs n_carrier n_dest
## <int> <int> <int>
## 1 227496 15 116
# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "AA")
# Generate summarizing statistics for aa
summarize(aa, n_flights = n(), n_canc = sum(Cancelled), avg_delay = mean(ArrDelay, na.rm=TRUE))
## # A tibble: 1 × 3
## n_flights n_canc avg_delay
## <int> <int> <dbl>
## 1 3244 60 0.8917558
Additionally, examples for the pipe/chain as per magrittr:
# Find the average delta in taxi times
# A magrittr chain: each step's result is piped as the first argument of
# the next verb.
hflights %>%
mutate(diff = (TaxiOut - TaxiIn)) %>%
filter(!is.na(diff)) %>%
summarize(avg = mean(diff))
## # A tibble: 1 × 1
## avg
## <dbl>
## 1 8.992064
# Find flights that average less than 70 mph assuming 100 wasted minutes per flight
# RealTime pads the elapsed time by 100 minutes; mph = 60 * miles / minutes.
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = 60 * Distance / RealTime) %>%
filter(!is.na(mph), mph < 70) %>%
summarize(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 4
## n_less n_dest min_dist max_dist
## <int> <int> <int> <int>
## 1 6726 13 79 305
# Find flights that average less than 105 mph, or that are diverted/cancelled
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarize(n_non = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 × 4
## n_non n_dest min_dist max_dist
## <int> <int> <int> <int>
## 1 42400 113 79 3904
# Find overnight flights
# DepTime > ArrTime (both clock times) implies the arrival fell on the
# following day.
filter(hflights, !is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarize(num = n())
## # A tibble: 1 × 1
## num
## <int>
## 1 2718
There is also the group_by capability, typically for use with summarize:
# group_by() makes the downstream summarise()/mutate() operate once per
# group instead of once over the whole table.
# Make an ordered per-carrier summary of hflights
group_by(hflights, UniqueCarrier) %>%
summarize(p_canc = 100 * mean(Cancelled, na.rm=TRUE), avg_delay = mean(ArrDelay, na.rm=TRUE)) %>%
arrange(avg_delay, p_canc)
## # A tibble: 15 × 3
## UniqueCarrier p_canc avg_delay
## <chr> <dbl> <dbl>
## 1 US 1.1268986 -0.6307692
## 2 AA 1.8495684 0.8917558
## 3 FL 0.9817672 1.8536239
## 4 AS 0.0000000 3.1923077
## 5 YV 1.2658228 4.0128205
## 6 DL 1.5903067 6.0841374
## 7 CO 0.6782614 6.0986983
## 8 MQ 2.9044750 7.1529751
## 9 EV 3.4482759 7.2569543
## 10 WN 1.5504047 7.5871430
## 11 F9 0.7159905 7.6682692
## 12 XE 1.5495599 8.1865242
## 13 OO 1.3946828 8.6934922
## 14 B6 2.5899281 9.8588410
## 15 UA 1.6409266 10.4628628
# Ordered overview of average arrival delays per carrier
# Only delayed flights (ArrDelay > 0) are kept before averaging.
hflights %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
group_by(UniqueCarrier) %>%
summarize(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## # A tibble: 15 × 3
## UniqueCarrier avg rank
## <chr> <dbl> <dbl>
## 1 YV 18.67568 1
## 2 F9 18.68683 2
## 3 US 20.70235 3
## 4 CO 22.13374 4
## 5 AS 22.91195 5
## 6 OO 24.14663 6
## 7 XE 24.19337 7
## 8 WN 25.27750 8
## 9 FL 27.85693 9
## 10 AA 28.49740 10
## 11 DL 32.12463 11
## 12 UA 32.48067 12
## 13 MQ 38.75135 13
## 14 EV 40.24231 14
## 15 B6 45.47744 15
# How many airplanes only flew to one destination?
hflights %>%
group_by(TailNum) %>%
summarise(destPerTail = n_distinct(Dest)) %>%
filter(destPerTail == 1) %>%
summarise(nplanes=n())
## # A tibble: 1 × 1
## nplanes
## <int>
## 1 1526
# Find the most visited destination for each carrier
# After summarise() the result is still grouped by UniqueCarrier, so
# rank(-n) ranks destinations within each carrier (descending count).
hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(-n)) %>%
filter(rank == 1)
## Source: local data frame [15 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Dest n rank
## <chr> <chr> <int> <dbl>
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1
## 7 F9 DEN 837 1
## 8 FL ATL 2029 1
## 9 MQ DFW 2424 1
## 10 OO COS 1335 1
## 11 UA SFO 643 1
## 12 US CLT 2212 1
## 13 WN DAL 8243 1
## 14 XE CRP 3175 1
## 15 YV CLT 71 1
# dplyr verbs also work on data.table objects. The startup message notes
# that the data.table + dplyr bridge has moved to the dtplyr package.
# Use summarise to calculate n_carrier
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
## The following object is masked from 'package:purrr':
##
## transpose
hflights2 <- as.data.table(hflights)
hflights2 %>%
summarize(n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1 15
And, dplyr can be used with databases, including generating the SQL query that corresponds to the dplyr request. Queries are evaluated lazily — results are only pulled from the server when needed, and are cached to avoid constantly pinging the server:
# dplyr can treat a remote database table as a data source: verbs are
# translated to SQL and executed on the server rather than locally.
# NOTE(review): this DataCamp demo MySQL server may no longer be
# reachable; the credentials below are course-provided and public.
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
## Observations: 336,776
## Variables: 17
## $ id (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay (int) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay (int) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time (int) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance (int) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour (int) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute (int) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights
# Note: no na.rm here — the aggregation runs as SQL AVG() on the server,
# which ignores NULLs by definition.
nycflights %>%
group_by(carrier) %>%
summarize(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
## Source: mysql 5.6.23-log [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## From: <derived table> [?? x 3]
## Arrange: avg_delay
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## carrier n_flights avg_delay
## (chr) (dbl) (dbl)
## 1 AS 714 -9.8613
## 2 HA 342 -6.9152
## 3 AA 32729 0.3556
## 4 DL 48110 1.6289
## 5 VX 5162 1.7487
## 6 US 20536 2.0565
## 7 UA 58665 3.5045
## 8 9E 18460 6.9135
## 9 B6 54635 9.3565
## 10 WN 12275 9.4675
## .. ... ... ...
Overall Course Overview - Goal is to have data in a single, tidy table
However, real-world data is typically split across multiple tables; this course will be about handling that:
Builds on the above course about basic dplyr. As is common in R, there is more than one way to handle things; base::merge() offers similar functionality through a single function, however:
Chapter 1 - Mutating Joins
Keys are the columns that are “matched” between datasets that are being joined:
Joins can be run in several manners:
Variations on joins - the left_join and right_join are “mutating joins”, which is to say that they return a copy of the “primary” data with columns added as appropriate:
Example code includes:
# Build the artists and bands example tables from " ; "-delimited strings
# (a compact way to embed small fixtures directly in the notes).
artFirst <- "Jimmy ; George ; Mick ; Tom ; Davy ; John ; Paul ; Jimmy ; Joe ; Elvis ; Keith ; Paul ; Ringo ; Joe ; Brian ; Nancy"
artLast <- "Buffett ; Harrison ; Jagger ; Jones ; Jones ; Lennon ; McCartney ; Page ; Perry ; Presley ; Richards ; Simon ; Starr ; Walsh ; Wilson ; Wilson"
artInstrument <- "Guitar ; Guitar ; Vocals ; Vocals ; Vocals ; Guitar ; Bass ; Guitar ; Guitar ; Vocals ; Guitar ; Guitar ; Drums ; Guitar ; Vocals ; Vocals"
bandFirst <- "John ; John Paul ; Jimmy ; Robert ; George ; John ; Paul ; Ringo ; Jimmy ; Mick ; Keith ; Charlie ; Ronnie"
bandLast <- "Bonham ; Jones ; Page ; Plant ; Harrison ; Lennon ; McCartney ; Starr ; Buffett ; Jagger ; Richards ; Watts ; Woods"
bandBand <- "Led Zeppelin ; Led Zeppelin ; Led Zeppelin ; Led Zeppelin ; The Beatles ; The Beatles ; The Beatles ; The Beatles ; The Coral Reefers ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones"
artists <- data.frame( first=strsplit(artFirst, " ; ")[[1]] ,
last=strsplit(artLast, " ; ")[[1]] ,
instrument=strsplit(artInstrument, " ; ")[[1]] ,
stringsAsFactors=FALSE
)
bands <- data.frame( first=strsplit(bandFirst, " ; ")[[1]] ,
last=strsplit(bandLast, " ; ")[[1]] ,
band=strsplit(bandBand, " ; ")[[1]] ,
stringsAsFactors=FALSE
)
library(dplyr)
# Complete the code to join artists to bands
# left_join() keeps every row of bands; unmatched artists columns are NA.
bands2 <- left_join(bands, artists, by = c("first", "last"))
# Examine the results
bands2
## first last band instrument
## 1 John Bonham Led Zeppelin <NA>
## 2 John Paul Jones Led Zeppelin <NA>
## 3 Jimmy Page Led Zeppelin Guitar
## 4 Robert Plant Led Zeppelin Guitar
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Mick Jagger The Rolling Stones Vocals
## 11 Keith Richards The Rolling Stones Guitar
## 12 Charlie Watts The Rolling Stones <NA>
## 13 Ronnie Woods The Rolling Stones <NA>
# Note how this would be WRONG even though the code executes fine
# Joining on first name alone duplicates rows (fan-out) and mismatches
# people — a reminder to always join on a key that uniquely identifies.
left_join(bands, artists, by = c("first"))
## first last.x band last.y instrument
## 1 John Bonham Led Zeppelin Lennon Guitar
## 2 John Paul Jones Led Zeppelin <NA> <NA>
## 3 Jimmy Page Led Zeppelin Buffett Guitar
## 4 Jimmy Page Led Zeppelin Page Guitar
## 5 Robert Plant Led Zeppelin <NA> <NA>
## 6 George Harrison The Beatles Harrison Guitar
## 7 John Lennon The Beatles Lennon Guitar
## 8 Paul McCartney The Beatles McCartney Bass
## 9 Paul McCartney The Beatles Simon Guitar
## 10 Ringo Starr The Beatles Starr Drums
## 11 Jimmy Buffett The Coral Reefers Buffett Guitar
## 12 Jimmy Buffett The Coral Reefers Page Guitar
## 13 Mick Jagger The Rolling Stones Jagger Vocals
## 14 Keith Richards The Rolling Stones Richards Guitar
## 15 Charlie Watts The Rolling Stones <NA> <NA>
## 16 Ronnie Woods The Rolling Stones <NA> <NA>
# Finish the code below to recreate bands3 with a right join
# A right_join with the table order swapped is equivalent to a left_join.
bands2 <- left_join(bands, artists, by = c("first", "last"))
bands3 <- right_join(artists, bands, by = c("first", "last"))
# Check that bands3 is equal to bands2
setequal(bands2, bands3)
## TRUE
# Build songs and albums fixtures: records are " ; "-separated, fields
# within each record are " : "-separated; t(sapply(...)) turns the list
# of field vectors into a character matrix, one row per record.
songData <- "Come Together : Abbey Road : John : Lennon ; Dream On : Aerosmith : Steven : Tyler ; Hello, Goodbye : Magical Mystery Tour : Paul : McCartney ; It's Not Unusual : Along Came Jones : Tom : Jones"
albumsData <- "A Hard Day's Night : The Beatles : 1964 ; Magical Mystery Tour : The Beatles : 1967 ; Beggar's Banquet : The Rolling Stones : 1968 ; Abbey Road : The Beatles : 1969 ; Led Zeppelin IV : Led Zeppelin : 1971 ; The Dark Side of the Moon : Pink Floyd : 1973 ; Aerosmith : Aerosmith : 1973 ; Rumours : Fleetwood Mac : 1977 ; Hotel California : Eagles : 1982"
songs <- as.data.frame( t(sapply(strsplit(songData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
)
) , stringsAsFactors=FALSE
)
albums <- as.data.frame( t(sapply(strsplit(albumsData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
))
, stringsAsFactors=FALSE
)
names(songs) <- c("song", "album", "first", "last")
names(albums) <- c("album", "band", "year")
# Join albums to songs using inner_join()
# inner_join() keeps only rows with a match in BOTH tables.
inner_join(songs, albums, by="album")
## song album first last band year
## 1 Come Together Abbey Road John Lennon The Beatles 1969
## 2 Dream On Aerosmith Steven Tyler Aerosmith 1973
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney The Beatles 1967
# Join bands to artists using full_join()
# full_join() keeps every row of both tables, padding non-matches with NA.
full_join(artists, bands, by=c("first", "last"))
## first last instrument band
## 1 Jimmy Buffett Guitar The Coral Reefers
## 2 George Harrison Guitar The Beatles
## 3 Mick Jagger Vocals The Rolling Stones
## 4 Tom Jones Vocals <NA>
## 5 Davy Jones Vocals <NA>
## 6 John Lennon Guitar The Beatles
## 7 Paul McCartney Bass The Beatles
## 8 Jimmy Page Guitar Led Zeppelin
## 9 Joe Perry Guitar <NA>
## 10 Elvis Presley Vocals <NA>
## 11 Keith Richards Guitar The Rolling Stones
## 12 Paul Simon Guitar <NA>
## 13 Ringo Starr Drums The Beatles
## 14 Joe Walsh Guitar <NA>
## 15 Brian Wilson Vocals <NA>
## 16 Nancy Wilson Vocals <NA>
## 17 John Bonham <NA> Led Zeppelin
## 18 John Paul Jones <NA> Led Zeppelin
## 19 Robert Plant <NA> Led Zeppelin
## 20 Charlie Watts <NA> The Rolling Stones
## 21 Ronnie Woods <NA> The Rolling Stones
# Find guitarists in bands dataset (don't change)
temp <- left_join(bands, artists, by = c("first", "last"))
temp <- filter(temp, instrument == "Guitar")
select(temp, first, last, band)
## first last band
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
# Reproduce code above using pipes
# Same result as the temp-variable version, without the intermediate object.
bands %>%
left_join(artists, by = c("first", "last")) %>%
filter(instrument == "Guitar") %>%
select(first, last, band)
## first last band
## 1 Jimmy Page Led Zeppelin
## 2 George Harrison The Beatles
## 3 John Lennon The Beatles
## 4 Jimmy Buffett The Coral Reefers
## 5 Keith Richards The Rolling Stones
# Here the goal fixture is stored column-wise (one " ; " record per
# column), so sapply() without t() yields the right orientation.
goalData <- "Tom : John : Paul ; Jones : Lennon : McCartney ; Vocals : Guitar : Bass ; NA : The Beatles : The Beatles ; It's Not Unusual : Come Together : Hello, Goodbye ; Along Came Jones : Abbey Road : Magical Mystery Tour"
goal <- as.data.frame( sapply(strsplit(goalData, " ; ")[[1]],
FUN=function(x) { strsplit(x, " : ")[[1]] } ,
USE.NAMES=FALSE
) , stringsAsFactors=FALSE
)
names(goal) <- c("first", "last", "instrument", "band", "song", "album")
goal[goal == "NA"] <- NA # Fix the text that is "NA"
# Examine the contents of the goal dataset
goal
## first last instrument band song
## 1 Tom Jones Vocals <NA> It's Not Unusual
## 2 John Lennon Guitar The Beatles Come Together
## 3 Paul McCartney Bass The Beatles Hello, Goodbye
## album
## 1 Along Came Jones
## 2 Abbey Road
## 3 Magical Mystery Tour
# Create goal2 using full_join() and inner_join()
goal2 <- artists %>%
full_join(bands, by=c("first", "last")) %>%
inner_join(songs, by=c("first", "last"))
# Check that goal and goal2 are the same
setequal(goal, goal2)
## TRUE
# Cross-check: element-wise comparison finds no differing cells either.
sum(goal != goal2, na.rm=TRUE)
## [1] 0
# Create one table that combines all information
# Chained full_joins accumulate every row from all four tables; the last
# join uses both album AND band so album names shared by different bands
# (e.g. "Aerosmith" the album vs the band) do not mismatch.
artists %>%
full_join(bands, by=c("first", "last")) %>%
full_join(songs, by=c("first", "last")) %>%
full_join(albums, by=c("album", "band"))
## first last instrument band song
## 1 Jimmy Buffett Guitar The Coral Reefers <NA>
## 2 George Harrison Guitar The Beatles <NA>
## 3 Mick Jagger Vocals The Rolling Stones <NA>
## 4 Tom Jones Vocals <NA> It's Not Unusual
## 5 Davy Jones Vocals <NA> <NA>
## 6 John Lennon Guitar The Beatles Come Together
## 7 Paul McCartney Bass The Beatles Hello, Goodbye
## 8 Jimmy Page Guitar Led Zeppelin <NA>
## 9 Joe Perry Guitar <NA> <NA>
## 10 Elvis Presley Vocals <NA> <NA>
## 11 Keith Richards Guitar The Rolling Stones <NA>
## 12 Paul Simon Guitar <NA> <NA>
## 13 Ringo Starr Drums The Beatles <NA>
## 14 Joe Walsh Guitar <NA> <NA>
## 15 Brian Wilson Vocals <NA> <NA>
## 16 Nancy Wilson Vocals <NA> <NA>
## 17 John Bonham <NA> Led Zeppelin <NA>
## 18 John Paul Jones <NA> Led Zeppelin <NA>
## 19 Robert Plant <NA> Led Zeppelin <NA>
## 20 Charlie Watts <NA> The Rolling Stones <NA>
## 21 Ronnie Woods <NA> The Rolling Stones <NA>
## 22 Steven Tyler <NA> <NA> Dream On
## 23 <NA> <NA> <NA> The Beatles <NA>
## 24 <NA> <NA> <NA> The Rolling Stones <NA>
## 25 <NA> <NA> <NA> Led Zeppelin <NA>
## 26 <NA> <NA> <NA> Pink Floyd <NA>
## 27 <NA> <NA> <NA> Aerosmith <NA>
## 28 <NA> <NA> <NA> Fleetwood Mac <NA>
## 29 <NA> <NA> <NA> Eagles <NA>
## album year
## 1 <NA> <NA>
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 Along Came Jones <NA>
## 5 <NA> <NA>
## 6 Abbey Road 1969
## 7 Magical Mystery Tour 1967
## 8 <NA> <NA>
## 9 <NA> <NA>
## 10 <NA> <NA>
## 11 <NA> <NA>
## 12 <NA> <NA>
## 13 <NA> <NA>
## 14 <NA> <NA>
## 15 <NA> <NA>
## 16 <NA> <NA>
## 17 <NA> <NA>
## 18 <NA> <NA>
## 19 <NA> <NA>
## 20 <NA> <NA>
## 21 <NA> <NA>
## 22 Aerosmith <NA>
## 23 A Hard Day's Night 1964
## 24 Beggar's Banquet 1968
## 25 Led Zeppelin IV 1971
## 26 The Dark Side of the Moon 1973
## 27 Aerosmith 1973
## 28 Rumours 1977
## 29 Hotel California 1982
Chapter 2 - Filtering Joins and Set Operations
Filtering joins return a copy of the primary data frame that has been filtered rather than augmented:
The anti_join() is the opposite of the semi_join() in that it keeps only rows that DO NOT have a match:
Set operations are used when two datasets contain the exact same variables:
Comparing datasets can also be run using setequal():
Example code includes:
# Filtering joins: semi_join() keeps rows of x that HAVE a match in y,
# without adding any columns from y (and without duplicating rows).
# Data sets still available from the previous module
# View the output of semi_join()
artists %>%
semi_join(songs, by = c("first", "last"))
## first last instrument
## 1 John Lennon Guitar
## 2 Paul McCartney Bass
## 3 Tom Jones Vocals
# Create the same result
# Equivalent, clumsier construction: mutate-join then strip the extras.
artists %>%
right_join(songs, by = c("first", "last")) %>%
filter(!is.na(instrument)) %>%
select(first, last, instrument)
## first last instrument
## 1 John Lennon Guitar
## 2 Paul McCartney Bass
## 3 Tom Jones Vocals
albums %>%
# Collect the albums made by a band
semi_join(bands, by="band") %>%
# Count the albums made by a band
nrow()
## [1] 5
# Create data set tracks and matches
trackTrack <- "Can't Buy Me Love ; I Feel Fine ; A Hard Day's Night ; Sound of Silence ; Help! ; Ticket to Ride ; I am a Rock ; Yellow Submarine / Eleanor Rigby ; Homeward Bound ; Scarborough Fair ; Penny Lane ; Strawberry Fields Forever ; Hello, Goodbye ; Ruby Tuesday ; All You Need Is Love ; Hey Jude ; Lady Madonna ; Get Back ; Sympathy for the Devil ; Brown Sugar ; Happy"
trackBand <- "The Beatles ; The Beatles ; The Beatles ; Simon and Garfunkel ; The Beatles ; The Beatles ; Simon and Garfunkel ; The Beatles ; Simon and Garfunkel ; Simon and Garfunkel ; The Beatles ; The Beatles ; The Beatles ; The Rolling Stones ; The Beatles ; The Beatles ; The Beatles ; The Beatles ; The Rolling Stones ; The Rolling Stones ; The Rolling Stones"
trackLabel <- "Parlophone ; Parlophone ; Parlophone ; Columbia ; Parlophone ; Parlophone ; Columbia ; Parlophone ; Columbia ; Columbia ; Parlophone ; Parlophone ; Parlophone ; Decca ; Parlophone ; Apple ; Parlophone ; Apple ; Decca ; Rolling Stones Records ; Rolling Stones Records"
trackYear <- "1964 ; 1964 ; 1964 ; 1964 ; 1965 ; 1965 ; 1965 ; 1966 ; 1966 ; 1966 ; 1967 ; 1967 ; 1967 ; 1967 ; 1967 ; 1968 ; 1968 ; 1969 ; 1969 ; 1971 ; 1972"
trackFirst <- "Paul ; John ; John ; Paul ; John ; John ; Paul ; Paul ; Paul ; unknown ; Paul ; John ; Paul ; Keith ; John ; Paul ; Paul ; Paul ; Mick ; Mick ; Keith"
trackLast <- "McCartney ; Lennon ; Lennon ; Simon ; Lennon ; Lennon ; Simon ; McCartney ; Simon ; unknown ; McCartney ; Lennon ; McCartney ; Richards ; Lennon ; McCartney ; McCartney ; McCartney ; Jagger ; Jagger ; Richards"
tracks <- data.frame(track=strsplit(trackTrack, " ; ")[[1]],
band=strsplit(trackBand, " ; ")[[1]],
label=strsplit(trackLabel, " ; ")[[1]],
year=as.integer(strsplit(trackYear, " ; ")[[1]]),
first=strsplit(trackFirst, " ; ")[[1]],
last=strsplit(trackLast, " ; ")[[1]],
stringsAsFactors = FALSE
)
matches <- data.frame(band=c("The Beatles", "The Beatles", "Simon and Garfunkel"),
year=c(1964L, 1965L, 1966L),
first=c("Paul", "John", "Paul"),
stringsAsFactors=FALSE
)
# Comparison of effort required
# One semi_join() on three keys replaces the sprawling filter() below.
tracks %>% semi_join(
matches,
by = c("band", "year", "first")
)
## track band label year first last
## 1 Can't Buy Me Love The Beatles Parlophone 1964 Paul McCartney
## 2 Help! The Beatles Parlophone 1965 John Lennon
## 3 Ticket to Ride The Beatles Parlophone 1965 John Lennon
## 4 Homeward Bound Simon and Garfunkel Columbia 1966 Paul Simon
tracks %>% filter(
(band == "The Beatles" &
year == 1964 & first == "Paul") |
(band == "The Beatles" &
year == 1965 & first == "John") |
(band == "Simon and Garfunkel" &
year == 1966 & first == "Paul")
)
## track band label year first last
## 1 Can't Buy Me Love The Beatles Parlophone 1964 Paul McCartney
## 2 Help! The Beatles Parlophone 1965 John Lennon
## 3 Ticket to Ride The Beatles Parlophone 1965 John Lennon
## 4 Homeward Bound Simon and Garfunkel Columbia 1966 Paul Simon
# anti_join(): the complement of semi_join() — keeps rows of x that do
# NOT have a match in y.
# (Note: this snippet appears twice verbatim in the course transcript.)
# Return rows of artists that don't have bands info
artists %>%
anti_join(bands, by=c("first", "last"))
## first last instrument
## 1 Elvis Presley Vocals
## 2 Brian Wilson Vocals
## 3 Nancy Wilson Vocals
## 4 Tom Jones Vocals
## 5 Davy Jones Vocals
## 6 Paul Simon Guitar
## 7 Joe Walsh Guitar
## 8 Joe Perry Guitar
# Return rows of artists that don't have bands info
artists %>%
anti_join(bands, by=c("first", "last"))
## first last instrument
## 1 Elvis Presley Vocals
## 2 Brian Wilson Vocals
## 3 Nancy Wilson Vocals
## 4 Tom Jones Vocals
## 5 Davy Jones Vocals
## 6 Paul Simon Guitar
## 7 Joe Walsh Guitar
## 8 Joe Perry Guitar
albumMyLabel <- "Abbey Road ; A Hard Days Night ; Magical Mystery Tour ; Led Zeppelin IV ; The Dark Side of the Moon ; Hotel California ; Rumours ; Aerosmith ; Beggar's Banquet"
labelMyLabel <- "Apple ; Parlophone ; Parlophone ; Atlantic ; Harvest ; Asylum ; Warner Brothers ; Columbia ; Decca"
myLabels <- data.frame(album=strsplit(albumMyLabel, " ; ")[[1]],
label=strsplit(labelMyLabel, " ; ")[[1]],
stringsAsFactors=FALSE
)
# Check whether album names in labels are mis-entered
# anti_join() as a data-quality check: the only non-matching album is
# "A Hard Days Night" (missing the apostrophe of "A Hard Day's Night").
myLabels %>%
anti_join(albums, by="album")
## album label
## 1 A Hard Days Night Parlophone
# Determine which key joins labels and songs
myLabels
## album label
## 1 Abbey Road Apple
## 2 A Hard Days Night Parlophone
## 3 Magical Mystery Tour Parlophone
## 4 Led Zeppelin IV Atlantic
## 5 The Dark Side of the Moon Harvest
## 6 Hotel California Asylum
## 7 Rumours Warner Brothers
## 8 Aerosmith Columbia
## 9 Beggar's Banquet Decca
songs
## song album first last
## 1 Come Together Abbey Road John Lennon
## 2 Dream On Aerosmith Steven Tyler
## 3 Hello, Goodbye Magical Mystery Tour Paul McCartney
## 4 It's Not Unusual Along Came Jones Tom Jones
# Check your understanding
songs %>%
# Find the rows of songs that match a row in labels
semi_join(myLabels, by="album") %>%
# Number of matches between labels and songs
nrow()
## [1] 3
# Set operations (union/intersect/setdiff) treat whole rows as elements;
# they require the two data frames to have identical columns.
songAerosmith <- "Make It ; Somebody ; Dream On ; One Way Street ; Mama Kin ; Write me a Letter ; Moving Out ; Walking the Dog"
lengthAerosmith <- "13260 ; 13500 ; 16080 ; 25200 ; 15900 ; 15060 ; 18180 ; 11520"
songGreatestHits <- "Dream On ; Mama Kin ; Same Old Song and Dance ; Seasons of Winter ; Sweet Emotion ; Walk this Way ; Big Ten Inch Record ; Last Child ; Back in the Saddle ; Draw the Line ; Kings and Queens ; Come Together ; Remember (Walking in the Sand) ; Lightning Strikes ; Chip Away the Stone ; Sweet Emotion (remix) ; One Way Street (live)"
lengthGreatestHits <- "16080 ; 16020 ; 11040 ; 17820 ; 11700 ; 12780 ; 8100 ; 12480 ; 16860 ; 12240 ; 13680 ; 13620 ; 14700 ; 16080 ; 14460 ; 16560 ; 24000"
songLive <- "Back in the Saddle ; Sweet Emotion ; Lord of the Thighs ; Toys in the Attic ; Last Child ; Come Together ; Walk this Way ; Sick as a Dog ; Dream On ; Chip Away the Stone ; Sight for Sore Eyes ; Mama Kin ; S.O.S. (Too Bad) ; I Ain't Got You ; Mother Popcorn/Draw the Line ; Train Kept A-Rollin'/Strangers in the Night"
lengthLive <- "15900 ; 16920 ; 26280 ; 13500 ; 12240 ; 17460 ; 13560 ; 16920 ; 16260 ; 15120 ; 11880 ; 13380 ; 9960 ; 14220 ; 41700 ; 17460"
aerosmith <- data.frame(song=strsplit(songAerosmith, " ; ")[[1]],
length=as.integer(strsplit(lengthAerosmith, " ; ")[[1]]),
stringsAsFactors=FALSE
)
greatest_hits <- data.frame(song=strsplit(songGreatestHits, " ; ")[[1]],
length=as.integer(strsplit(lengthGreatestHits, " ; ")[[1]]),
stringsAsFactors=FALSE
)
myLive <- data.frame(song=strsplit(songLive, " ; ")[[1]],
length=as.integer(strsplit(lengthLive, " ; ")[[1]]),
stringsAsFactors=FALSE
)
aerosmith %>%
# Create the new dataset using a set operation
# union() de-duplicates: 8 + 17 rows collapse to 24 unique (song, length)
# pairs, since "Dream On" appears in both with the same length.
union(greatest_hits) %>%
# Count the total number of songs
nrow()
## [1] 24
# Create the new dataset using a set operation
aerosmith %>%
intersect(greatest_hits)
## song length
## 1 Dream On 16080
# Select the song names from live
live_songs <- myLive %>% select(song)
# Select the song names from greatest_hits
greatest_songs <- greatest_hits %>% select(song)
# Create the new dataset using a set operation
# setdiff(x, y): rows of x that are not in y (order of args matters).
live_songs %>%
setdiff(greatest_songs)
## song
## 1 Lord of the Thighs
## 2 Toys in the Attic
## 3 Sick as a Dog
## 4 Sight for Sore Eyes
## 5 S.O.S. (Too Bad)
## 6 I Ain't Got You
## 7 Mother Popcorn/Draw the Line
## 8 Train Kept A-Rollin'/Strangers in the Night
# Select songs from live and greatest_hits
live_songs <- select(myLive, song)
greatest_songs <- select(greatest_hits, song)
# Return the songs that only exist in one dataset
# Symmetric difference built from two setdiffs and a union.
union(setdiff(live_songs, greatest_songs), setdiff(greatest_songs, live_songs))
## song
## 1 Lord of the Thighs
## 2 Toys in the Attic
## 3 Sick as a Dog
## 4 Sight for Sore Eyes
## 5 S.O.S. (Too Bad)
## 6 I Ain't Got You
## 7 Mother Popcorn/Draw the Line
## 8 Train Kept A-Rollin'/Strangers in the Night
## 9 Same Old Song and Dance
## 10 Seasons of Winter
## 11 Big Ten Inch Record
## 12 Draw the Line
## 13 Kings and Queens
## 14 Remember (Walking in the Sand)
## 15 Lightning Strikes
## 16 Sweet Emotion (remix)
## 17 One Way Street (live)
# The exercises below compare two versions of a dataset; the definitive,
# complete, and soundtrack data were course-only, so the calls are kept
# commented out for reference.
# identical()  -> TRUE only if rows are equal AND in the same order.
# setequal()   -> TRUE if the same rows appear in any order.
# setdiff()/anti_join() -> show WHICH rows differ, in each direction.
# DO NOT HAVE DATA - NEED TO SKIP
# Check if same order: definitive and complete
# identical(definitive, complete)
# Check if any order: definitive and complete
# setequal(definitive, complete)
# Songs in definitive but not complete
# setdiff(definitive, complete)
# Songs in complete but not definitive
# setdiff(complete, definitive)
# Return songs in definitive that are not in complete
# definitive %>%
# anti_join(complete, by=c("song", "album"))
# Return songs in complete that are not in definitive
# complete %>%
# anti_join(definitive, by=c("song", "album"))
# Check if same order: definitive and union of complete and soundtrack
# identical(definitive, union(complete, soundtrack))
# Check if any order: definitive and union of complete and soundtrack
# setequal(definitive, union(complete, soundtrack))
Chapter 3 - Assembling Data
Binding is the process of either combining columns for datasets that have the same rows, or combining rows for datasets that have the same columns:
Building a better data frame - equivalents for data.frame and as.data.frame:
Working with data types - R typically behaves intuitively:
General coercion rules - more specific types of data will generally be converted to less specific types of data:
Example code includes:
# bind_rows() stacks datasets that share the same columns (here, the two
# sides of "The Dark Side of the Moon"; lengths are in, presumably,
# hundredths of a second — TODO confirm the unit against the course).
songSideOne <- "Speak to Me ; Breathe ; On the Run ; Time ; The Great Gig in the Sky"
lengthSideOne <- "5400 ; 9780 ; 12600 ; 24780 ; 15300"
songSideTwo <-"Money ; Us and Them ; Any Colour You Like ; Brain Damage ; Eclipse"
lengthSideTwo <-"23400 ; 28260 ; 12240 ; 13800 ; 7380"
side_one <- data.frame(song=strsplit(songSideOne, " ; ")[[1]],
length=as.integer(strsplit(lengthSideOne, " ; ")[[1]]),
stringsAsFactors=FALSE
)
side_two <- data.frame(song=strsplit(songSideTwo, " ; ")[[1]],
length=as.integer(strsplit(lengthSideTwo, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# Examine side_one and side_two
side_one
## song length
## 1 Speak to Me 5400
## 2 Breathe 9780
## 3 On the Run 12600
## 4 Time 24780
## 5 The Great Gig in the Sky 15300
side_two
## song length
## 1 Money 23400
## 2 Us and Them 28260
## 3 Any Colour You Like 12240
## 4 Brain Damage 13800
## 5 Eclipse 7380
# Bind side_one and side_two into a single dataset
side_one %>%
bind_rows(side_two)
## song length
## 1 Speak to Me 5400
## 2 Breathe 9780
## 3 On the Run 12600
## 4 Time 24780
## 5 The Great Gig in the Sky 15300
## 6 Money 23400
## 7 Us and Them 28260
## 8 Any Colour You Like 12240
## 9 Brain Damage 13800
## 10 Eclipse 7380
# bind_rows() on a NAMED list of data frames: .id="album" turns each list
# element's name into a column, which then serves as the join key.
# Create shorter version of jimi
jimi <- list(data.frame(song=c("Purple Haze", "Hey Joe", "Fire"),
length=c(9960L, 12180L, 9240L),
stringsAsFactors=FALSE
),
data.frame(song=c("EXP", "Little Wing", "Little Miss Lover", "Bold as Love"),
length=c(6900L, 8640L, 8400L, 15060L),
stringsAsFactors=FALSE
),
data.frame(song=c("Voodoo Chile", "Gypsy Eyes"),
length=c(54000L, 13380L),
stringsAsFactors=FALSE
)
)
names(jimi) <- c("Are You Experienced", "Axis: Bold As Love", "Electric Ladyland")
discography <- data.frame(album=names(jimi),
year=c(1967L, 1967L, 1968L),
stringsAsFactors=FALSE
)
# Examine discography and jimi
discography
## album year
## 1 Are You Experienced 1967
## 2 Axis: Bold As Love 1967
## 3 Electric Ladyland 1968
jimi
## $`Are You Experienced`
## song length
## 1 Purple Haze 9960
## 2 Hey Joe 12180
## 3 Fire 9240
##
## $`Axis: Bold As Love`
## song length
## 1 EXP 6900
## 2 Little Wing 8640
## 3 Little Miss Lover 8400
## 4 Bold as Love 15060
##
## $`Electric Ladyland`
## song length
## 1 Voodoo Chile 54000
## 2 Gypsy Eyes 13380
jimi %>%
# Bind jimi into a single data frame
bind_rows(.id="album") %>%
# Make a complete data frame
left_join(discography, by="album")
## album song length year
## 1 Are You Experienced Purple Haze 9960 1967
## 2 Are You Experienced Hey Joe 12180 1967
## 3 Are You Experienced Fire 9240 1967
## 4 Axis: Bold As Love EXP 6900 1967
## 5 Axis: Bold As Love Little Wing 8640 1967
## 6 Axis: Bold As Love Little Miss Lover 8400 1967
## 7 Axis: Bold As Love Bold as Love 15060 1967
## 8 Electric Ladyland Voodoo Chile 54000 1968
## 9 Electric Ladyland Gypsy Eyes 13380 1968
# Create the hank data
songHankYears <- "Move It On Over ; My Love for You (Has Turned to Hate) ; Never Again (Will I Knock on Your Door) ; On the Banks of the Old Ponchartrain ; Pan American ; Wealth Won't Save Your Soul ; A Mansion on the Hill ; Honky Tonkin' ; I Saw the Light ; I'm a Long Gone Daddy ; My Sweet Love Ain't Around ; I'm So Lonesome I Could Cry ; Lost Highway ; Lovesick Blues ; Mind Your Own Business ; My Bucket's Got a Hole in It ; Never Again (Will I Knock on Your Door) ; Wedding Bells ; You're Gonna Change (Or I'm Gonna Leave) ; I Just Don't Like This Kind of Living ; Long Gone Lonesome Blues ; Moanin' the Blues ; My Son Calls Another Man Daddy ; Nobody's Lonesome for Me ; They'll Never Take Her Love from Me ; Why Don't You Love Me ; Why Should We Try Anymore ; (I Heard That) Lonesome Whistle ; Baby, We're Really in Love ; Cold, Cold Heart ; Crazy Heart ; Dear John ; Hey Good Lookin' ; Howlin' At the Moon ; I Can't Help It (If I'm Still in Love With You) ; Half as Much ; Honky Tonk Blues ; I'll Never Get Out of This World Alive ; Jambalaya (On the Bayou) ; Settin' the Woods on Fire ; You Win Again ; Calling You ; I Won't Be Home No More ; Kaw-Liga ; Take These Chains from My Heart ; Weary Blues from Waitin' ; Your Cheatin' Heart ; (I'm Gonna) Sing, Sing, Sing ; How Can You Refuse Him Now ; I'm Satisfied with You ; You Better Keep It on Your Mind ; A Teardrop on a Rose ; At the First Fall of Snow ; Mother Is Gone ; Please Don't Let Me Love You ; Thank God ; A Home in Heaven ; California Zephyr ; Singing Waterfall ; There's No Room in My Heart for the Blues ; Leave Me Alone with the Blues ; Ready to Go Home ; The Waltz of the Wind ; Just Waitin' ; The Pale Horse and His Rider ; Kaw-Liga ; There's a Tear in My Beer"
# Hank Williams discography practice data (DataCamp "Joining Data" course),
# rebuilt here from " ; "-delimited strings.
yearHankYears <- "1947 ; 1947 ; 1947 ; 1947 ; 1947 ; 1947 ; 1948 ; 1948 ; 1948 ; 1948 ; 1948 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1949 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1950 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1951 ; 1952 ; 1952 ; 1952 ; 1952 ; 1952 ; 1952 ; 1953 ; 1953 ; 1953 ; 1953 ; 1953 ; 1953 ; 1954 ; 1954 ; 1954 ; 1954 ; 1955 ; 1955 ; 1955 ; 1955 ; 1955 ; 1956 ; 1956 ; 1956 ; 1956 ; 1957 ; 1957 ; 1957 ; 1958 ; 1965 ; 1966 ; 1989"
songHankCharts <- "(I Heard That) Lonesome Whistle ; (I'm Gonna) Sing, Sing, Sing ; A Home in Heaven ; A Mansion on the Hill ; A Teardrop on a Rose ; At the First Fall of Snow ; Baby, We're Really in Love ; California Zephyr ; Calling You ; Cold, Cold Heart ; Crazy Heart ; Dear John ; Half as Much ; Hey Good Lookin' ; Honky Tonk Blues ; Honky Tonkin' ; How Can You Refuse Him Now ; Howlin' At the Moon ; I Can't Help It (If I'm Still in Love With You) ; I Just Don't Like This Kind of Living ; I Saw the Light ; I Won't Be Home No More ; I'll Never Get Out of This World Alive ; I'm a Long Gone Daddy ; I'm Satisfied with You ; I'm So Lonesome I Could Cry ; Jambalaya (On the Bayou) ; Just Waitin' ; Kaw-Liga ; Kaw-Liga ; Leave Me Alone with the Blues ; Long Gone Lonesome Blues ; Lost Highway ; Lovesick Blues ; Mind Your Own Business ; Moanin' the Blues ; Mother Is Gone ; Move It On Over ; My Bucket's Got a Hole in It ; My Love for You (Has Turned to Hate) ; My Son Calls Another Man Daddy ; My Sweet Love Ain't Around ; Never Again (Will I Knock on Your Door) ; Never Again (Will I Knock on Your Door) ; Nobody's Lonesome for Me ; On the Banks of the Old Ponchartrain ; Pan American ; Please Don't Let Me Love You ; Ready to Go Home ; Settin' the Woods on Fire ; Singing Waterfall ; Take These Chains from My Heart ; Thank God ; The Pale Horse and His Rider ; The Waltz of the Wind ; There's a Tear in My Beer ; There's No Room in My Heart for the Blues ; They'll Never Take Her Love from Me ; Wealth Won't Save Your Soul ; Weary Blues from Waitin' ; Wedding Bells ; Why Don't You Love Me ; Why Should We Try Anymore ; You Better Keep It on Your Mind ; You Win Again ; You're Gonna Change (Or I'm Gonna Leave) ; Your Cheatin' Heart"
peakHankCharts <- "9 ; NA ; NA ; 12 ; NA ; NA ; 4 ; NA ; NA ; 1 ; 4 ; 8 ; 2 ; 1 ; 2 ; 14 ; NA ; 3 ; 2 ; 5 ; NA ; 4 ; 1 ; 6 ; NA ; 2 ; 1 ; NA ; 1 ; NA ; NA ; 1 ; 12 ; 1 ; 5 ; 1 ; NA ; 4 ; 2 ; NA ; 9 ; NA ; NA ; 6 ; 9 ; NA ; NA ; 9 ; NA ; 2 ; NA ; 1 ; NA ; NA ; NA ; 7 ; NA ; 5 ; NA ; 7 ; 2 ; 1 ; 9 ; NA ; 10 ; 4 ; 1"
# NOTE(review): `songHankYears` is referenced on the next statement but is
# never defined anywhere in this document -- only `songHankCharts` exists.
# As written, building `hank_years` stops with
# "object 'songHankYears' not found". The chronological song-title string
# (aligned one-for-one with `yearHankYears`) appears to have been lost in
# transcription and must be restored for this chunk to run.
hank_years <- data.frame(year=as.integer(strsplit(yearHankYears, " ; ")[[1]]),
song=strsplit(songHankYears, " ; ")[[1]],
stringsAsFactors=FALSE
)
# hank_charts: one row per song (alphabetical) with its chart peak; the
# literal "NA" tokens coerce to NA via as.integer(), with the warning shown
# below.
hank_charts <- data.frame(song=strsplit(songHankCharts, " ; ")[[1]],
peak=as.integer(strsplit(peakHankCharts, " ; ")[[1]]),
stringsAsFactors=FALSE
)
## Warning in data.frame(song = strsplit(songHankCharts, " ; ")[[1]], peak =
## as.integer(strsplit(peakHankCharts, : NAs introduced by coercion
# Examine hank_years and hank_charts (printed as tibbles so only the first
# ten of the 67 rows appear; the two tables list the same songs in
# different row orders)
tibble::as_tibble(hank_years)
## # A tibble: 67 × 2
## year song
## <int> <chr>
## 1 1947 Move It On Over
## 2 1947 My Love for You (Has Turned to Hate)
## 3 1947 Never Again (Will I Knock on Your Door)
## 4 1947 On the Banks of the Old Ponchartrain
## 5 1947 Pan American
## 6 1947 Wealth Won't Save Your Soul
## 7 1948 A Mansion on the Hill
## 8 1948 Honky Tonkin'
## 9 1948 I Saw the Light
## 10 1948 I'm a Long Gone Daddy
## # ... with 57 more rows
tibble::as_tibble(hank_charts)
## # A tibble: 67 × 2
## song peak
## <chr> <int>
## 1 (I Heard That) Lonesome Whistle 9
## 2 (I'm Gonna) Sing, Sing, Sing NA
## 3 A Home in Heaven NA
## 4 A Mansion on the Hill 12
## 5 A Teardrop on a Rose NA
## 6 At the First Fall of Snow NA
## 7 Baby, We're Really in Love 4
## 8 California Zephyr NA
## 9 Calling You NA
## 10 Cold, Cold Heart 1
## # ... with 57 more rows
# bind_cols() matches rows purely by POSITION, so this only works because
# arrange(song) puts hank_years' rows into the same song order that
# hank_charts already uses. Any difference in sort order (e.g. locale)
# silently misaligns the years with the songs.
a <- hank_years %>%
# Reorder hank_years alphabetically by song title
arrange(song) %>%
# Select just the year column
select(year) %>%
# Bind the year column
bind_cols(hank_charts) %>%
# Arrange the finished dataset
arrange(year, song)
a # see the results
## year song peak
## 1 1947 Move It On Over 4
## 2 1947 My Love for You (Has Turned to Hate) NA
## 3 1947 Never Again (Will I Knock on Your Door) NA
## 4 1947 On the Banks of the Old Ponchartrain NA
## 5 1947 Pan American NA
## 6 1947 Wealth Won't Save Your Soul NA
## 7 1948 A Mansion on the Hill 12
## 8 1948 Honky Tonkin' 14
## 9 1948 I'm Satisfied with You NA
## 10 1948 I Just Don't Like This Kind of Living 5
## 11 1948 My Sweet Love Ain't Around NA
## 12 1949 I Won't Be Home No More 4
## 13 1949 Lost Highway 12
## 14 1949 Lovesick Blues 1
## 15 1949 Mind Your Own Business 5
## 16 1949 My Bucket's Got a Hole in It 2
## 17 1949 Never Again (Will I Knock on Your Door) 6
## 18 1949 Wedding Bells 2
## 19 1949 You Better Keep It on Your Mind NA
## 20 1950 I'm a Long Gone Daddy 6
## 21 1950 Long Gone Lonesome Blues 1
## 22 1950 Moanin' the Blues 1
## 23 1950 My Son Calls Another Man Daddy 9
## 24 1950 Nobody's Lonesome for Me 9
## 25 1950 They'll Never Take Her Love from Me 5
## 26 1950 Why Don't You Love Me 1
## 27 1950 Why Should We Try Anymore 9
## 28 1951 (I'm Gonna) Sing, Sing, Sing NA
## 29 1951 Baby, We're Really in Love 4
## 30 1951 Cold, Cold Heart 1
## 31 1951 Crazy Heart 4
## 32 1951 Dear John 8
## 33 1951 Hey Good Lookin' 1
## 34 1951 Howlin' At the Moon 3
## 35 1951 I'll Never Get Out of This World Alive 1
## 36 1952 Half as Much 2
## 37 1952 Honky Tonk Blues 2
## 38 1952 I Can't Help It (If I'm Still in Love With You) 2
## 39 1952 Jambalaya (On the Bayou) 1
## 40 1952 Settin' the Woods on Fire 2
## 41 1952 You're Gonna Change (Or I'm Gonna Leave) 4
## 42 1953 Calling You NA
## 43 1953 I'm So Lonesome I Could Cry 2
## 44 1953 Kaw-Liga 1
## 45 1953 Take These Chains from My Heart 1
## 46 1953 Weary Blues from Waitin' 7
## 47 1953 Your Cheatin' Heart 1
## 48 1954 (I Heard That) Lonesome Whistle 9
## 49 1954 How Can You Refuse Him Now NA
## 50 1954 I Saw the Light NA
## 51 1954 You Win Again 10
## 52 1955 A Teardrop on a Rose NA
## 53 1955 At the First Fall of Snow NA
## 54 1955 Mother Is Gone NA
## 55 1955 Please Don't Let Me Love You 9
## 56 1955 Thank God NA
## 57 1956 A Home in Heaven NA
## 58 1956 California Zephyr NA
## 59 1956 Singing Waterfall NA
## 60 1956 There's No Room in My Heart for the Blues NA
## 61 1957 Leave Me Alone with the Blues NA
## 62 1957 Ready to Go Home NA
## 63 1957 The Waltz of the Wind NA
## 64 1958 Just Waitin' NA
## 65 1965 The Pale Horse and His Rider NA
## 66 1966 Kaw-Liga NA
## 67 1989 There's a Tear in My Beer 7
# Pull the three aligned columns out of the combined result
hank_year <- a$year
hank_song <- a$song
hank_peak <- a$peak
# Make combined data frame; tibble() supersedes the deprecated data_frame()
tibble::tibble(year=hank_year, song=hank_song, peak=hank_peak) %>%
# Extract songs where peak equals 1
filter(peak == 1)
## # A tibble: 11 × 3
## year song peak
## <int> <chr> <int>
## 1 1949 Lovesick Blues 1
## 2 1950 Long Gone Lonesome Blues 1
## 3 1950 Moanin' the Blues 1
## 4 1950 Why Don't You Love Me 1
## 5 1951 Cold, Cold Heart 1
## 6 1951 Hey Good Lookin' 1
## 7 1951 I'll Never Get Out of This World Alive 1
## 8 1952 Jambalaya (On the Bayou) 1
## 9 1953 Kaw-Liga 1
## 10 1953 Take These Chains from My Heart 1
## 11 1953 Your Cheatin' Heart 1
# Store the same three vectors as a named list (list form of the data)
hank <- list(year=hank_year, song=hank_song, peak=hank_peak)
# Examine the contents of hank
hank
## $year
## [1] 1947 1947 1947 1947 1947 1947 1948 1948 1948 1948 1948 1949 1949 1949
## [15] 1949 1949 1949 1949 1949 1950 1950 1950 1950 1950 1950 1950 1950 1951
## [29] 1951 1951 1951 1951 1951 1951 1951 1952 1952 1952 1952 1952 1952 1953
## [43] 1953 1953 1953 1953 1953 1954 1954 1954 1954 1955 1955 1955 1955 1955
## [57] 1956 1956 1956 1956 1957 1957 1957 1958 1965 1966 1989
##
## $song
## [1] "Move It On Over"
## [2] "My Love for You (Has Turned to Hate)"
## [3] "Never Again (Will I Knock on Your Door)"
## [4] "On the Banks of the Old Ponchartrain"
## [5] "Pan American"
## [6] "Wealth Won't Save Your Soul"
## [7] "A Mansion on the Hill"
## [8] "Honky Tonkin'"
## [9] "I'm Satisfied with You"
## [10] "I Just Don't Like This Kind of Living"
## [11] "My Sweet Love Ain't Around"
## [12] "I Won't Be Home No More"
## [13] "Lost Highway"
## [14] "Lovesick Blues"
## [15] "Mind Your Own Business"
## [16] "My Bucket's Got a Hole in It"
## [17] "Never Again (Will I Knock on Your Door)"
## [18] "Wedding Bells"
## [19] "You Better Keep It on Your Mind"
## [20] "I'm a Long Gone Daddy"
## [21] "Long Gone Lonesome Blues"
## [22] "Moanin' the Blues"
## [23] "My Son Calls Another Man Daddy"
## [24] "Nobody's Lonesome for Me"
## [25] "They'll Never Take Her Love from Me"
## [26] "Why Don't You Love Me"
## [27] "Why Should We Try Anymore"
## [28] "(I'm Gonna) Sing, Sing, Sing"
## [29] "Baby, We're Really in Love"
## [30] "Cold, Cold Heart"
## [31] "Crazy Heart"
## [32] "Dear John"
## [33] "Hey Good Lookin'"
## [34] "Howlin' At the Moon"
## [35] "I'll Never Get Out of This World Alive"
## [36] "Half as Much"
## [37] "Honky Tonk Blues"
## [38] "I Can't Help It (If I'm Still in Love With You)"
## [39] "Jambalaya (On the Bayou)"
## [40] "Settin' the Woods on Fire"
## [41] "You're Gonna Change (Or I'm Gonna Leave)"
## [42] "Calling You"
## [43] "I'm So Lonesome I Could Cry"
## [44] "Kaw-Liga"
## [45] "Take These Chains from My Heart"
## [46] "Weary Blues from Waitin'"
## [47] "Your Cheatin' Heart"
## [48] "(I Heard That) Lonesome Whistle"
## [49] "How Can You Refuse Him Now"
## [50] "I Saw the Light"
## [51] "You Win Again"
## [52] "A Teardrop on a Rose"
## [53] "At the First Fall of Snow"
## [54] "Mother Is Gone"
## [55] "Please Don't Let Me Love You"
## [56] "Thank God"
## [57] "A Home in Heaven"
## [58] "California Zephyr"
## [59] "Singing Waterfall"
## [60] "There's No Room in My Heart for the Blues"
## [61] "Leave Me Alone with the Blues"
## [62] "Ready to Go Home"
## [63] "The Waltz of the Wind"
## [64] "Just Waitin'"
## [65] "The Pale Horse and His Rider"
## [66] "Kaw-Liga"
## [67] "There's a Tear in My Beer"
##
## $peak
## [1] 4 NA NA NA NA NA 12 14 NA 5 NA 4 12 1 5 2 6 2 NA 6 1 1 9
## [24] 9 5 1 9 NA 4 1 4 8 1 3 1 2 2 2 1 2 4 NA 2 1 1 7
## [47] 1 9 NA NA 10 NA NA NA 9 NA NA NA NA NA NA NA NA NA NA NA 7
# Convert the hank list into a data frame
# (as_tibble() supersedes the deprecated as_data_frame())
tibble::as_tibble(hank) %>%
# Extract songs where peak equals 1
filter(peak == 1)
## # A tibble: 11 × 3
## year song peak
## <int> <chr> <int>
## 1 1949 Lovesick Blues 1
## 2 1950 Long Gone Lonesome Blues 1
## 3 1950 Moanin' the Blues 1
## 4 1950 Why Don't You Love Me 1
## 5 1951 Cold, Cold Heart 1
## 6 1951 Hey Good Lookin' 1
## 7 1951 I'll Never Get Out of This World Alive 1
## 8 1952 Jambalaya (On the Bayou) 1
## 9 1953 Kaw-Liga 1
## 10 1953 Take These Chains from My Heart 1
## 11 1953 Your Cheatin' Heart 1
### ** DO NOT RUN DUE TO NOT HAVING DATASET
# Examine the contents of michael
# michael
# bind_rows(michael, .id="album") %>%
# group_by(album) %>%
# mutate(rank = min_rank(peak)) %>%
# filter(rank == 1) %>%
# select(-rank, -peak)
# A factor stores integer level codes plus a levels attribute, even when
# the labels look numeric.
y <- factor(c(5, 6, 7, 6))
y
## [1] 5 6 7 6
## Levels: 5 6 7
# unclass() exposes the underlying codes (1-based positions into levels)
unclass(y)
## [1] 1 2 3 2
## attr(,"levels")
## [1] "5" "6" "7"
as.character(y)
## [1] "5" "6" "7" "6"
# Classic trap: as.numeric() on a factor returns the level CODES...
as.numeric(y)
## [1] 1 2 3 2
# ...so round-trip through as.character() to recover the original numbers.
as.numeric(as.character(y))
## [1] 5 6 7 6
### ** DO NOT RUN DUE TO NOT HAVING DATASET
# seventies %>%
# Coerce seventies$year into a useful numeric
# mutate(year = as.numeric(as.character(year))) %>%
# Bind the updated version of seventies to sixties
# bind_rows(sixties) %>%
# arrange(year)
Chapter 4 - Advanced Joining
What can go wrong? General issues can be organized as a 2x2 matrix: key values and key columns can each be either missing or duplicated:
Defining the keys - expanding on the previous approaches that have always used by= explicitly in the join function:
Joining multiple tables is an extension of joining two tables:
Other implementations can be available:
Example code includes:
# stage_songs stores the song titles as ROW NAMES rather than as a column.
stage_songs <- data.frame(musical=c("Into the Woods", "West Side Story",
"Cats", "Phantom of the Opera"
),
year=c(1986L, 1957L, 1981L, 1986L),
stringsAsFactors=FALSE
)
rownames(stage_songs) <- c("Children Will Listen", "Maria",
"Memory", "The Music of the Night"
)
# stage_writers has a proper `song` key column, built from those row names.
stage_writers <- data.frame(song=rownames(stage_songs),
composer=c("Stephen Sondheim", "Louis Bernstein",
"Andrew Lloyd Webber", "Andrew Lloyd Webber"
),
stringsAsFactors=FALSE
)
# dplyr joins cannot use row names as keys, so promote them to a real
# column first with tibble::rownames_to_column().
stage_songs %>%
# Add row names as a column named song
tibble::rownames_to_column(var="song") %>%
# Left join stage_writers to stage_songs
left_join(stage_writers, by="song")
## song musical year composer
## 1 Children Will Listen Into the Woods 1986 Stephen Sondheim
## 2 Maria West Side Story 1957 Louis Bernstein
## 3 Memory Cats 1981 Andrew Lloyd Webber
## 4 The Music of the Night Phantom of the Opera 1986 Andrew Lloyd Webber
# Both tables have an NA in the key column `movie`.
singers <- data.frame(movie=c(NA, "The Sound of Music"),
singer=c("Arnold Schwarzenegger", "Julie Andrews"),
stringsAsFactors=FALSE
)
two_songs <- data.frame(movie=c("The Sound of Music", NA),
song=c("Do-Re-Mi", "A Spoonful of Sugar"),
stringsAsFactors=FALSE
)
# Examine the result of joining singers to two_songs:
# dplyr joins treat NA keys as EQUAL to each other by default, so the two
# unrelated NA rows get matched, producing the nonsense second row below.
two_songs %>% inner_join(singers, by = "movie")
## movie song singer
## 1 The Sound of Music Do-Re-Mi Julie Andrews
## 2 <NA> A Spoonful of Sugar Arnold Schwarzenegger
# Remove NA's from key before joining
two_songs %>%
filter(!is.na(movie)) %>%
inner_join(singers, by = "movie")
## movie song singer
## 1 The Sound of Music Do-Re-Mi Julie Andrews
movieMovieYears <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can"
nameMovieYears <- "Bing Crosby ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Frank Sinatra"
yearMovieYears <- "1942 ; 1944 ; 1945 ; 1946 ; 1954 ; 1955 ; 1956 ; 1957 ; 1957 ; 1960"
movieMovieStudios <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can"
nameMovieStudios <- "Paramount Pictures ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures ; Columbia Pictures ; Twentieth-Century Fox"
# Both tables carry a NON-key column called `name` (the artist in one, the
# studio in the other), so the join disambiguates them with the suffixes
# .x (left table) and .y (right table).
movie_years <- data.frame(movie=strsplit(movieMovieYears, " ; ")[[1]],
name=strsplit(nameMovieYears, " ; ")[[1]],
year=as.integer(strsplit(yearMovieYears, " ; ")[[1]]),
stringsAsFactors=FALSE
)
movie_studios <- data.frame(movie=strsplit(movieMovieStudios, " ; ")[[1]],
name=strsplit(nameMovieStudios, " ; ")[[1]],
stringsAsFactors=FALSE
)
movie_years %>%
# Left join movie_studios to movie_years
left_join(movie_studios, by="movie") %>%
# Rename the columns: artist and studio
rename(artist=name.x, studio=name.y)
## movie artist year studio
## 1 The Road to Morocco Bing Crosby 1942 Paramount Pictures
## 2 Going My Way Bing Crosby 1944 Paramount Pictures
## 3 Anchors Aweigh Frank Sinatra 1945 Metro-Goldwyn-Mayer
## 4 Till the Clouds Roll By Frank Sinatra 1946 Metro-Goldwyn-Mayer
## 5 White Christmas Bing Crosby 1954 Paramount Pictures
## 6 The Tender Trap Frank Sinatra 1955 Metro-Goldwyn-Mayer
## 7 High Society Bing Crosby 1956 Metro-Goldwyn-Mayer
## 8 The Joker is Wild Frank Sinatra 1957 Paramount Pictures
## 9 Pal Joey Frank Sinatra 1957 Columbia Pictures
## 10 Can-Can Frank Sinatra 1960 Twentieth-Century Fox
elvis_movies <- data.frame(name=c("Jailhouse Rock", "Blue Hawaii",
"Viva Las Vegas", "Clambake"
),
year=c(1957L, 1961L, 1963L, 1967L),
stringsAsFactors=FALSE
)
elvTemp <- "(You're So Square) Baby I Don't Care ; I Can't Help Falling in Love ; Jailhouse Rock ; Viva Las Vegas ; You Don't Know Me"
# elvis_songs: each song row points at its movie via the index vector
elvis_songs <- data.frame(name=strsplit(elvTemp, " ; ")[[1]],
movie=elvis_movies$name[c(1, 2, 1, 3, 4)],
stringsAsFactors=FALSE
)
# Identify the key column: the keys have DIFFERENT names in the two tables
# (elvis_movies$name is the movie title, matching elvis_songs$movie)
elvis_songs
## name movie
## 1 (You're So Square) Baby I Don't Care Jailhouse Rock
## 2 I Can't Help Falling in Love Blue Hawaii
## 3 Jailhouse Rock Jailhouse Rock
## 4 Viva Las Vegas Viva Las Vegas
## 5 You Don't Know Me Clambake
elvis_movies
## name year
## 1 Jailhouse Rock 1957
## 2 Blue Hawaii 1961
## 3 Viva Las Vegas 1963
## 4 Clambake 1967
elvis_movies %>%
# Left join elvis_songs to elvis_movies by this column;
# by=c("name"="movie") equates movies$name with songs$movie, and the
# songs' own `name` column survives with the .y suffix
left_join(elvis_songs, by=c("name"="movie")) %>%
# Rename columns
rename(movie=name, song=name.y)
## movie year song
## 1 Jailhouse Rock 1957 (You're So Square) Baby I Don't Care
## 2 Jailhouse Rock 1957 Jailhouse Rock
## 3 Blue Hawaii 1961 I Can't Help Falling in Love
## 4 Viva Las Vegas 1963 Viva Las Vegas
## 5 Clambake 1967 You Don't Know Me
# Two-level encoding: " : " separates whole columns, " ; " separates the
# values within each column.
mdData <- "Anchors Aweigh ; Can-Can ; Going My Way ; High Society ; Pal Joey ; The Joker is Wild ; The Road to Morocco ; The Tender Trap ; Till the Clouds Roll By ; White Christmas : George Sidney ; Walter Lang ; Leo McCarey ; Charles Walters ; George Sidney ; Charles Vidor ; David Butler ; Charles Walters ; Richard Whorf ; Michael Curtiz : Metro-Goldwyn-Mayer ; Twentieth-Century Fox ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Columbia Pictures ; Paramount Pictures ; Paramount Pictures ; Metro-Goldwyn-Mayer ; Metro-Goldwyn-Mayer ; Paramount Pictures"
myData <- "The Road to Morocco ; Going My Way ; Anchors Aweigh ; Till the Clouds Roll By ; White Christmas ; The Tender Trap ; High Society ; The Joker is Wild ; Pal Joey ; Can-Can : Bing Crosby ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Bing Crosby ; Frank Sinatra ; Frank Sinatra ; Frank Sinatra : 1942 ; 1944 ; 1945 ; 1946 ; 1954 ; 1955 ; 1956 ; 1957 ; 1957 ; 1960"
# Split into columns, then split each column into values; as.data.frame()
# assembles the resulting list of character vectors into columns.
movie_directors <- as.data.frame(lapply(strsplit(mdData, " : "),
FUN=function(x) { strsplit(x, " ; ") }
),
stringsAsFactors=FALSE
)
names(movie_directors) <- c("name", "director", "studio")
# NOTE: this rebuilds (and overwrites) the movie_years object created
# earlier; the content is the same movie/name/year data.
movie_years <- as.data.frame(lapply(strsplit(myData, " : "),
FUN=function(x) { strsplit(x, " ; ") }
),
stringsAsFactors=FALSE
)
names(movie_years) <- c("movie", "name", "year")
movie_years$year <- as.integer(movie_years$year)
# Identify the key columns: movie_directors$name matches movie_years$movie
movie_directors
## name director studio
## 1 Anchors Aweigh George Sidney Metro-Goldwyn-Mayer
## 2 Can-Can Walter Lang Twentieth-Century Fox
## 3 Going My Way Leo McCarey Paramount Pictures
## 4 High Society Charles Walters Metro-Goldwyn-Mayer
## 5 Pal Joey George Sidney Columbia Pictures
## 6 The Joker is Wild Charles Vidor Paramount Pictures
## 7 The Road to Morocco David Butler Paramount Pictures
## 8 The Tender Trap Charles Walters Metro-Goldwyn-Mayer
## 9 Till the Clouds Roll By Richard Whorf Metro-Goldwyn-Mayer
## 10 White Christmas Michael Curtiz Paramount Pictures
movie_years
## movie name year
## 1 The Road to Morocco Bing Crosby 1942
## 2 Going My Way Bing Crosby 1944
## 3 Anchors Aweigh Frank Sinatra 1945
## 4 Till the Clouds Roll By Frank Sinatra 1946
## 5 White Christmas Bing Crosby 1954
## 6 The Tender Trap Frank Sinatra 1955
## 7 High Society Bing Crosby 1956
## 8 The Joker is Wild Frank Sinatra 1957
## 9 Pal Joey Frank Sinatra 1957
## 10 Can-Can Frank Sinatra 1960
movie_years %>%
# Left join movie_directors to movie_years (keys named differently)
left_join(movie_directors, by=c("movie"="name")) %>%
# Arrange the columns using select()
rename(artist=name) %>%
select(year, movie, artist, director, studio)
## year movie artist director
## 1 1942 The Road to Morocco Bing Crosby David Butler
## 2 1944 Going My Way Bing Crosby Leo McCarey
## 3 1945 Anchors Aweigh Frank Sinatra George Sidney
## 4 1946 Till the Clouds Roll By Frank Sinatra Richard Whorf
## 5 1954 White Christmas Bing Crosby Michael Curtiz
## 6 1955 The Tender Trap Frank Sinatra Charles Walters
## 7 1956 High Society Bing Crosby Charles Walters
## 8 1957 The Joker is Wild Frank Sinatra Charles Vidor
## 9 1957 Pal Joey Frank Sinatra George Sidney
## 10 1960 Can-Can Frank Sinatra Walter Lang
## studio
## 1 Paramount Pictures
## 2 Paramount Pictures
## 3 Metro-Goldwyn-Mayer
## 4 Metro-Goldwyn-Mayer
## 5 Paramount Pictures
## 6 Metro-Goldwyn-Mayer
## 7 Metro-Goldwyn-Mayer
## 8 Paramount Pictures
## 9 Columbia Pictures
## 10 Twentieth-Century Fox
### *** DO NOT RUN DUE TO NOT HAVING DATA
# Place supergroups, more_bands, and more_artists into a list
# list(supergroups, more_bands, more_artists) %>%
# Use reduce to join together the contents of the list
# purrr::reduce(left_join, by=c("first", "last"))
# list(more_artists, more_bands, supergroups) %>%
# Return rows of more_artists in all three datasets
# purrr::reduce(semi_join, by=c("first", "last"))
# Data is available from previous
# Alter the code to perform the join with a dplyr function
# Base-R equivalent of a left join: merge() with all.x = TRUE keeps every
# row of bands; note merge() re-sorts the rows (here by the keys), unlike
# dplyr's left_join shown next.
merge(bands, artists, by = c("first", "last"), all.x = TRUE) %>%
arrange(band)
## first last band instrument
## 1 Jimmy Page Led Zeppelin Guitar
## 2 John Bonham Led Zeppelin <NA>
## 3 John Paul Jones Led Zeppelin <NA>
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Charlie Watts The Rolling Stones <NA>
## 11 Keith Richards The Rolling Stones Guitar
## 12 Mick Jagger The Rolling Stones Vocals
## 13 Ronnie Woods The Rolling Stones <NA>
# The same left outer join in dplyr; preserves bands' original row order
bands %>%
left_join(artists, by=c("first", "last"))
## first last band instrument
## 1 John Bonham Led Zeppelin <NA>
## 2 John Paul Jones Led Zeppelin <NA>
## 3 Jimmy Page Led Zeppelin Guitar
## 4 Robert Plant Led Zeppelin <NA>
## 5 George Harrison The Beatles Guitar
## 6 John Lennon The Beatles Guitar
## 7 Paul McCartney The Beatles Bass
## 8 Ringo Starr The Beatles Drums
## 9 Jimmy Buffett The Coral Reefers Guitar
## 10 Mick Jagger The Rolling Stones Vocals
## 11 Keith Richards The Rolling Stones Guitar
## 12 Charlie Watts The Rolling Stones <NA>
## 13 Ronnie Woods The Rolling Stones <NA>
Chapter 5 - Case Study
Lahman’s Baseball Database - the Sean Lahman package containing 26 tables, accessed through library(Lahman):
Using the “Salaries” data:
The dataset “HallOfFame” contains the votes and inductions by player:
Example code includes:
library(Lahman)
## Warning: package 'Lahman' was built under R version 3.2.5
# This will be missing battingLabels, fieldingLabels, pitchingLabels, LahmanData
# For every table name listed in LahmanData$file, fetch the table itself
# with get() and record its column names as a one-column data frame (var).
lahmanNames <- lapply(LahmanData[, "file"],
FUN=function(x) {
data.frame(var=names(get(x)), stringsAsFactors=FALSE)
}
)
# Name each element after the table it describes
names(lahmanNames) <- LahmanData$file
# Examine lahmanNames
lahmanNames
## $AllstarFull
## var
## 1 playerID
## 2 yearID
## 3 gameNum
## 4 gameID
## 5 teamID
## 6 lgID
## 7 GP
## 8 startingPos
##
## $Appearances
## var
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 G_all
## 6 GS
## 7 G_batting
## 8 G_defense
## 9 G_p
## 10 G_c
## 11 G_1b
## 12 G_2b
## 13 G_3b
## 14 G_ss
## 15 G_lf
## 16 G_cf
## 17 G_rf
## 18 G_of
## 19 G_dh
## 20 G_ph
## 21 G_pr
##
## $AwardsManagers
## var
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsPlayers
## var
## 1 playerID
## 2 awardID
## 3 yearID
## 4 lgID
## 5 tie
## 6 notes
##
## $AwardsShareManagers
## var
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $AwardsSharePlayers
## var
## 1 awardID
## 2 yearID
## 3 lgID
## 4 playerID
## 5 pointsWon
## 6 pointsMax
## 7 votesFirst
##
## $Batting
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## 11 X3B
## 12 HR
## 13 RBI
## 14 SB
## 15 CS
## 16 BB
## 17 SO
## 18 IBB
## 19 HBP
## 20 SH
## 21 SF
## 22 GIDP
##
## $BattingPost
## var
## 1 yearID
## 2 round
## 3 playerID
## 4 teamID
## 5 lgID
## 6 G
## 7 AB
## 8 R
## 9 H
## 10 X2B
## 11 X3B
## 12 HR
## 13 RBI
## 14 SB
## 15 CS
## 16 BB
## 17 SO
## 18 IBB
## 19 HBP
## 20 SH
## 21 SF
## 22 GIDP
##
## $CollegePlaying
## var
## 1 playerID
## 2 schoolID
## 3 yearID
##
## $Fielding
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 PB
## 15 WP
## 16 SB
## 17 CS
## 18 ZR
##
## $FieldingOF
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 Glf
## 5 Gcf
## 6 Grf
##
## $FieldingPost
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 round
## 6 POS
## 7 G
## 8 GS
## 9 InnOuts
## 10 PO
## 11 A
## 12 E
## 13 DP
## 14 TP
## 15 PB
## 16 SB
## 17 CS
##
## $HallOfFame
## var
## 1 playerID
## 2 yearID
## 3 votedBy
## 4 ballots
## 5 needed
## 6 votes
## 7 inducted
## 8 category
## 9 needed_note
##
## $Managers
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 G
## 7 W
## 8 L
## 9 rank
## 10 plyrMgr
##
## $ManagersHalf
## var
## 1 playerID
## 2 yearID
## 3 teamID
## 4 lgID
## 5 inseason
## 6 half
## 7 G
## 8 W
## 9 L
## 10 rank
##
## $Master
## var
## 1 playerID
## 2 birthYear
## 3 birthMonth
## 4 birthDay
## 5 birthCountry
## 6 birthState
## 7 birthCity
## 8 deathYear
## 9 deathMonth
## 10 deathDay
## 11 deathCountry
## 12 deathState
## 13 deathCity
## 14 nameFirst
## 15 nameLast
## 16 nameGiven
## 17 weight
## 18 height
## 19 bats
## 20 throws
## 21 debut
## 22 finalGame
## 23 retroID
## 24 bbrefID
## 25 deathDate
## 26 birthDate
##
## $Pitching
## var
## 1 playerID
## 2 yearID
## 3 stint
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## 11 SHO
## 12 SV
## 13 IPouts
## 14 H
## 15 ER
## 16 HR
## 17 BB
## 18 SO
## 19 BAOpp
## 20 ERA
## 21 IBB
## 22 WP
## 23 HBP
## 24 BK
## 25 BFP
## 26 GF
## 27 R
## 28 SH
## 29 SF
## 30 GIDP
##
## $PitchingPost
## var
## 1 playerID
## 2 yearID
## 3 round
## 4 teamID
## 5 lgID
## 6 W
## 7 L
## 8 G
## 9 GS
## 10 CG
## 11 SHO
## 12 SV
## 13 IPouts
## 14 H
## 15 ER
## 16 HR
## 17 BB
## 18 SO
## 19 BAOpp
## 20 ERA
## 21 IBB
## 22 WP
## 23 HBP
## 24 BK
## 25 BFP
## 26 GF
## 27 R
## 28 SH
## 29 SF
## 30 GIDP
##
## $Salaries
## var
## 1 yearID
## 2 teamID
## 3 lgID
## 4 playerID
## 5 salary
##
## $Schools
## var
## 1 schoolID
## 2 name_full
## 3 city
## 4 state
## 5 country
##
## $SeriesPost
## var
## 1 yearID
## 2 round
## 3 teamIDwinner
## 4 lgIDwinner
## 5 teamIDloser
## 6 lgIDloser
## 7 wins
## 8 losses
## 9 ties
##
## $Teams
## var
## 1 yearID
## 2 lgID
## 3 teamID
## 4 franchID
## 5 divID
## 6 Rank
## 7 G
## 8 Ghome
## 9 W
## 10 L
## 11 DivWin
## 12 WCWin
## 13 LgWin
## 14 WSWin
## 15 R
## 16 AB
## 17 H
## 18 X2B
## 19 X3B
## 20 HR
## 21 BB
## 22 SO
## 23 SB
## 24 CS
## 25 HBP
## 26 SF
## 27 RA
## 28 ER
## 29 ERA
## 30 CG
## 31 SHO
## 32 SV
## 33 IPouts
## 34 HA
## 35 HRA
## 36 BBA
## 37 SOA
## 38 E
## 39 DP
## 40 FP
## 41 name
## 42 park
## 43 attendance
## 44 BPF
## 45 PPF
## 46 teamIDBR
## 47 teamIDlahman45
## 48 teamIDretro
##
## $TeamsFranchises
## var
## 1 franchID
## 2 franchName
## 3 active
## 4 NAassoc
##
## $TeamsHalf
## var
## 1 yearID
## 2 lgID
## 3 teamID
## 4 Half
## 5 divID
## 6 DivWin
## 7 Rank
## 8 G
## 9 W
## 10 L
# Find variables in common across ALL tables by pairwise intersect();
# the empty result shows no single column appears in every table
purrr::reduce(lahmanNames, intersect)
## [1] var
## <0 rows> (or 0-length row.names)
lahmanNames %>%
# Bind the data frames in lahmanNames (.id records each source table name)
bind_rows(.id="dataframe") %>%
# Group the result by var
group_by(var) %>%
# Tally the number of appearances
tally() %>%
# Filter the data to variables appearing in more than one table
filter(n > 1) %>%
# Arrange the results, most widely shared variables first
arrange(-n)
## # A tibble: 57 × 2
## var n
## <chr> <int>
## 1 yearID 21
## 2 playerID 19
## 3 lgID 17
## 4 teamID 13
## 5 G 10
## 6 L 6
## 7 W 6
## 8 BB 5
## 9 CS 5
## 10 GS 5
## # ... with 47 more rows
# Which tables contain a playerID column? Stack the per-table name lists
# (tagging each row with its source table), keep only the playerID rows,
# and pull out the table names as a character vector using the magrittr
# dot placeholder.
lahmanNames %>%
bind_rows(.id = "dataframe") %>%
filter(var == "playerID") %>%
.$dataframe
## [1] "AllstarFull" "Appearances" "AwardsManagers"
## [4] "AwardsPlayers" "AwardsShareManagers" "AwardsSharePlayers"
## [7] "Batting" "BattingPost" "CollegePlaying"
## [10] "Fielding" "FieldingOF" "FieldingPost"
## [13] "HallOfFame" "Managers" "ManagersHalf"
## [16] "Master" "Pitching" "PitchingPost"
## [19] "Salaries"
# players: one row per distinct playerID with first and last name
players <- Master %>%
# Return the columns playerID, nameFirst and nameLast
select(playerID, nameFirst, nameLast) %>%
# Return one row for each distinct player
distinct()
players %>%
# Find all players who do not appear in Salaries
# (anti_join keeps the rows of players with NO match in Salaries)
anti_join(Salaries, by="playerID") %>%
# Count them
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 13888
players %>%
anti_join(Salaries, by = "playerID") %>%
# How many unsalaried players appear in Appearances?
# (semi_join keeps players with at least one Appearances row)
semi_join(Appearances, by="playerID") %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 13695
players %>%
# Find all players who do not appear in Salaries
anti_join(Salaries, by="playerID") %>%
# Join them to Appearances (left join keeps players even with no
# appearances; their G_all values will be absent)
left_join(Appearances, by="playerID") %>%
# Calculate total_games for each player
group_by(playerID) %>%
summarize(total_games=sum(G_all, na.rm=TRUE)) %>%
# Arrange in descending order by total_games
arrange(-total_games)
## # A tibble: 13,888 × 2
## playerID total_games
## <chr> <int>
## 1 yastrca01 3308
## 2 aaronha01 3298
## 3 cobbty01 3034
## 4 musiast01 3026
## 5 mayswi01 2992
## 6 robinbr01 2896
## 7 kalinal01 2834
## 8 collied01 2826
## 9 robinfr02 2808
## 10 wagneho01 2794
## # ... with 13,878 more rows
players %>%
# Find unsalaried players
anti_join(Salaries, by="playerID") %>%
# Join Batting to the unsalaried players
left_join(Batting, by="playerID") %>%
# Group by player
group_by(playerID) %>%
# Sum at-bats (AB) for each player -- named total_at_bats to match what
# is actually summed (the original column name total_games was a
# copy-paste slip from the previous pipeline, which summed games)
summarize(total_at_bats=sum(AB, na.rm=TRUE)) %>%
# Arrange in descending order
arrange(-total_at_bats)
## # A tibble: 13,888 × 2
## playerID total_at_bats
## <chr> <int>
## 1 aaronha01 12364
## 2 yastrca01 11988
## 3 cobbty01 11434
## 4 musiast01 10972
## 5 mayswi01 10881
## 6 robinbr01 10654
## 7 wagneho01 10430
## 8 brocklo01 10332
## 9 ansonca01 10277
## 10 aparilu01 10230
## # ... with 13,878 more rows
# Find the distinct players that appear in HallOfFame (i.e. every player
# ever nominated, regardless of outcome)
nominated <- HallOfFame %>%
select(playerID) %>%
distinct()
nominated %>%
# Count the number of players in nominated
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 1239
nominated_full <- nominated %>%
# Join to Master to attach each nominee's name
left_join(Master, by="playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
# Find distinct players in HallOfFame with inducted == "Y"
inducted <- HallOfFame %>%
filter(inducted == "Y") %>%
select(playerID) %>%
distinct()
inducted %>%
# Count the number of players in inducted
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 312
inducted_full <- inducted %>%
# Join to Master to attach each inductee's name
left_join(Master, by="playerID") %>%
# Return playerID, nameFirst, nameLast
select(playerID, nameFirst, nameLast)
# Tally the number of awards in AwardsPlayers by playerID
nAwards <- AwardsPlayers %>%
group_by(playerID) %>%
tally()
nAwards %>%
# Filter to just the players in inducted
# (semi_join keeps matching rows without adding any columns)
semi_join(inducted, by="playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n=mean(n, na.rm=TRUE))
## # A tibble: 1 × 1
## avg_n
## <dbl>
## 1 12.10582
nAwards %>%
# Filter to just the players in nominated
semi_join(nominated, by="playerID") %>%
# Filter to players NOT in inducted (i.e. nominated but never inducted)
anti_join(inducted, by="playerID") %>%
# Calculate the mean number of awards per player
summarize(avg_n=mean(n, na.rm=TRUE))
## # A tibble: 1 × 1
## avg_n
## <dbl>
## 1 4.18985
# Find the players who are in nominated, but not inducted
# (setdiff() on the one-column data frames is a row-wise set difference)
notInducted <- nominated %>%
setdiff(inducted)
Salaries %>%
# Find the players who are in notInducted
semi_join(notInducted, by="playerID") %>%
# Calculate the max salary by player
group_by(playerID) %>%
summarize(max_salary=max(salary)) %>%
# Calculate the average of the max salaries
summarize(avg_salary=mean(max_salary))
## # A tibble: 1 × 1
## avg_salary
## <dbl>
## 1 4876812
# Repeat for players who were inducted (same max-then-average pattern)
Salaries %>%
semi_join(inducted, by="playerID") %>%
group_by(playerID) %>%
summarize(max_salary=max(salary)) %>%
summarize(avg_salary=mean(max_salary))
## # A tibble: 1 × 1
## avg_salary
## <dbl>
## 1 5673190
Appearances %>%
# Filter Appearances against nominated
semi_join(nominated, by="playerID") %>%
# Find last year played by player
group_by(playerID) %>%
summarize(last_year=max(yearID)) %>%
# Join to full HallOfFame (one row per ballot appearance)
left_join(HallOfFame, by="playerID") %>%
# Filter for unusual observations: ballots cast in or before the player's
# final season (votes are normally held only after retirement)
filter((yearID - last_year) < 1)
## # A tibble: 39 × 10
## playerID last_year yearID votedBy ballots needed votes
## <chr> <int> <int> <chr> <int> <int> <int>
## 1 cissebi01 1938 1937 BBWAA 201 151 1
## 2 cochrmi01 1937 1936 BBWAA 226 170 80
## 3 deandi01 1947 1945 BBWAA 247 186 17
## 4 deandi01 1947 1946 Final Ballot 263 198 45
## 5 deandi01 1947 1946 Nominating Vote 202 NA 40
## 6 deandi01 1947 1947 BBWAA 161 121 88
## 7 dickebi01 1946 1945 BBWAA 247 186 17
## 8 dickebi01 1946 1946 Nominating Vote 202 NA 40
## 9 dickebi01 1946 1946 Final Ballot 263 198 32
## 10 dimagjo01 1951 1945 BBWAA 247 186 1
## # ... with 29 more rows, and 3 more variables: inducted <fctr>,
## # category <fctr>, needed_note <chr>
The data.table library is designed to simplify and speed up work with large datasets. The language is broadly analogous to SQL, with syntax that includes equivalents for SELECT, WHERE, and GROUP BY. Some general attributes of a data.table object include:
NOTE - every data.table is also a data.frame, so if a package is not data.table-aware, the object simply behaves as a data.frame for that package.
General syntax is:
Example table creation:
Some example code includes:
library(data.table)
# The shorter column `a` (length 2) is recycled against `b` (length 4),
# yielding 4 rows with a = 1 2 1 2.
DT <- data.table(a = c(1, 2), b=LETTERS[1:4])
str(DT)
## Classes 'data.table' and 'data.frame': 4 obs. of 2 variables:
## $ a: num 1 2 1 2
## $ b: chr "A" "B" "C" "D"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: 1 A
## 2: 2 B
## 3: 1 C
## 4: 2 D
# Print the second to last row of DT using .N
# (inside DT[...], .N is the number of rows, so DT[.N-1] is row 3)
DT[.N-1]
## a b
## 1: 1 C
# Print the column names of DT
names(DT)
## [1] "a" "b"
# Print the number or rows and columns of DT
dim(DT)
## [1] 4 2
# Select row 2 twice and row 3, returning a data.table with three rows where row 2 is a duplicate of row 1.
DT[c(2, 2:3)]
## a b
## 1: 2 B
## 2: 2 B
## 3: 1 C
DT <- data.table(A = 1:5, B = letters[1:5], C = 6:10)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 3 variables:
## $ A: int 1 2 3 4 5
## $ B: chr "a" "b" "c" "d" ...
## $ C: int 6 7 8 9 10
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: 1 a 6
## 2: 2 b 7
## 3: 3 c 8
## 4: 4 d 9
## 5: 5 e 10
# Subset rows 1 and 3, and columns B and C
# (.() is data.table's alias for list() inside DT[i, j, by])
DT[c(1, 3), .(B, C)]
## B C
## 1: a 6
## 2: c 8
# Assign to ans the correct value
# (j can compute new columns on the fly; val = A*C is element-wise)
ans <- DT[ , .(B, val=A*C)]
ans
## B val
## 1: a 6
## 2: b 14
## 3: c 24
## 4: d 36
## 5: e 50
# Fill in the blanks such that ans2 equals target
target <- data.table(B = c("a", "b", "c", "d", "e", "a", "b", "c", "d", "e"),
val = as.integer(c(6:10, 1:5))
)
# c(C, A) has length 10, so the 5-element B column is recycled to match
ans2 <- DT[, .(B, val = c(C, A))]
identical(target, ans2)
## [1] TRUE
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# For each Species, print the mean Sepal.Length
DT[ , mean(Sepal.Length), Species]
## Species V1
## 1: setosa 5.006
## 2: versicolor 5.936
## 3: virginica 6.588
# Print mean Sepal.Length, grouping by first letter of Species
DT[ , mean(Sepal.Length), substr(Species, 1, 1)]
## substr V1
## 1: s 5.006
## 2: v 6.262
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
identical(DT, as.data.table(iris))
## [1] TRUE
# Group the specimens by Sepal area (to the nearest 10 cm2) and count how many occur in each group.
DT[, .N, by = 10 * round(Sepal.Length * Sepal.Width / 10)]
## round N
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Now name the output columns `Area` and `Count`
DT[, .(Count=.N), by = .(Area = 10 * round(Sepal.Length * Sepal.Width / 10))]
## Area Count
## 1: 20 117
## 2: 10 29
## 3: 30 4
# Create the data.table DT
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8)
)
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Create the new data.table, DT2
DT2 <- DT[, .(C = cumsum(C)), by = .(A, B)]
str(DT2)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 11 4 9 1 8 2 8
## - attr(*, ".internal.selfref")=<externalptr>
DT2
## A B C
## 1: b 1 3
## 2: b 1 11
## 3: b 2 4
## 4: b 2 9
## 5: a 3 1
## 6: a 3 8
## 7: a 4 2
## 8: a 4 8
# Select from DT2 the last two values from C while you group by A
DT2[, .(C = tail(C, 2)), by = A]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
The chaining operation in data.table is run as [statement][next statement].
Example code includes:
set.seed(1L)
DT <- data.table(A = rep(letters[2:1], each = 4L),
B = rep(1:4, each = 2L),
C = sample(8))
str(DT)
## Classes 'data.table' and 'data.frame': 8 obs. of 3 variables:
## $ A: chr "b" "b" "b" "b" ...
## $ B: int 1 1 2 2 3 3 4 4
## $ C: int 3 8 4 5 1 7 2 6
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: b 1 3
## 2: b 1 8
## 3: b 2 4
## 4: b 2 5
## 5: a 3 1
## 6: a 3 7
## 7: a 4 2
## 8: a 4 6
# Perform operation using chaining
DT[ , .(C = cumsum(C)), by = .(A, B)][ , .(C = tail(C, 2)), by=.(A)]
## A C
## 1: b 4
## 2: b 9
## 3: a 2
## 4: a 8
data(iris)
DT <- as.data.table(iris)
str(DT)
## Classes 'data.table' and 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Perform chained operations on DT
DT[ , .(Sepal.Length = median(Sepal.Length), Sepal.Width = median(Sepal.Width),
Petal.Length = median(Petal.Length), Petal.Width = median(Petal.Width)),
by=.(Species)][order(-Species)]
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1: virginica 6.5 3.0 5.55 2.0
## 2: versicolor 5.9 2.8 4.35 1.3
## 3: setosa 5.0 3.4 1.50 0.2
# Mean of columns
# DT[ , lapply(.SD, FUN=mean), by=.(x)]
# Median of columns
# DT[ , lapply(.SD, FUN=median), by=.(x)]
# Calculate the sum of the Q columns
# DT[ , lapply(.SD, FUN=sum), , .SDcols=2:4]
# Calculate the sum of columns H1 and H2
# DT[ , lapply(.SD, FUN=sum), , .SDcols=paste0("H", 1:2)]
# Select all but the first row of groups 1 and 2, returning only the grp column and the Q columns
# foo = function(x) { x[-1] }
# DT[ , lapply(.SD, FUN=foo), by=.(grp), .SDcols=paste0("Q", 1:3)]
# Sum of all columns and the number of rows
# DT[, c(lapply(.SD, FUN=sum), .N), by=.(x), .SDcols=names(DT)]
# Cumulative sum of column x and y while grouping by x and z > 8
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")]
# Chaining
# DT[, lapply(.SD, FUN=cumsum), by=.(by1=x, by2=(z>8)), .SDcols=c("x", "y")][ , lapply(.SD, FUN=max), by=.(by1), .SDcols=c("x", "y")]
# The data.table DT
DT <- data.table(A = letters[c(1, 1, 1, 2, 2)], B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: chr "a" "a" "a" "b" ...
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: a 1
## 2: a 2
## 3: a 3
## 4: b 4
## 5: b 5
# Add column by reference: Total
DT[ , Total:=sum(B), by=.(A)]
DT
## A B Total
## 1: a 1 6
## 2: a 2 6
## 3: a 3 6
## 4: b 4 9
## 5: b 5 9
# Add 1 to column B
DT[c(2,4) , B:=B+1L, ]
DT
## A B Total
## 1: a 1 6
## 2: a 3 6
## 3: a 3 6
## 4: b 5 9
## 5: b 5 9
# Add a new column Total2
DT[2:4, Total2:=sum(B), by=.(A)]
DT
## A B Total Total2
## 1: a 1 6 NA
## 2: a 3 6 6
## 3: a 3 6 6
## 4: b 5 9 5
## 5: b 5 9 NA
# Remove the Total column
DT[ , Total := NULL, ]
DT
## A B Total2
## 1: a 1 NA
## 2: a 3 6
## 3: a 3 6
## 4: b 5 5
## 5: b 5 NA
# Select the third column using `[[`
DT[[3]]
## [1] NA 6 6 5 NA
# A data.table DT has been created for you
DT <- data.table(A = c(1, 1, 1, 2, 2), B = 1:5)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ A: num 1 1 1 2 2
## $ B: int 1 2 3 4 5
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B
## 1: 1 1
## 2: 1 2
## 3: 1 3
## 4: 2 4
## 5: 2 5
# Update B, add C and D
DT[ , c("B", "C", "D") := .(B + 1, A + B, 2), ]
DT
## A B C D
## 1: 1 2 2 2
## 2: 1 3 3 2
## 3: 1 4 4 2
## 4: 2 5 6 2
## 5: 2 6 7 2
# Delete my_cols
my_cols <- c("B", "C")
DT[ , (my_cols) := NULL, ]
DT
## A D
## 1: 1 2
## 2: 1 2
## 3: 1 2
## 4: 2 2
## 5: 2 2
# Delete column 2 by number
DT[[2]] <- NULL
DT
## A
## 1: 1
## 2: 1
## 3: 1
## 4: 2
## 5: 2
# Set the seed
# set.seed(1)
# Check the DT that is made available to you
# DT
# For loop with set
# for(i in 2:4) { set(DT, sample(nrow(DT), 3), i, NA) }
# Change the column names to lowercase
# setnames(DT, letters[1:4])
# Print the resulting DT to the console
# DT
# Define DT
DT <- data.table(a = letters[c(1, 1, 1, 2, 2)], b = 1)
str(DT)
## Classes 'data.table' and 'data.frame': 5 obs. of 2 variables:
## $ a: chr "a" "a" "a" "b" ...
## $ b: num 1 1 1 1 1
## - attr(*, ".internal.selfref")=<externalptr>
DT
## a b
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Add a suffix "_2" to all column names
setnames(DT, paste0(names(DT), "_2"))
DT
## a_2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Change column name "a_2" to "A2"
setnames(DT, "a_2", "A2")
DT
## A2 b_2
## 1: a 1
## 2: a 1
## 3: a 1
## 4: b 1
## 5: b 1
# Reverse the order of the columns
setcolorder(DT, 2:1)
DT
## b_2 A2
## 1: 1 a
## 2: 1 a
## 3: 1 a
## 4: 1 b
## 5: 1 b
Example code includes:
# iris as a data.table
iris <- as.data.table(iris)
# Remove the "Sepal." prefix
names(iris) <- gsub("Sepal\\.", "", names(iris))
# Remove the two columns starting with "Petal"
iris[, c("Petal.Length", "Petal.Width") := NULL, ]
# Cleaned up iris data.table
str(iris)
## Classes 'data.table' and 'data.frame': 150 obs. of 3 variables:
## $ Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Area is greater than 20 square centimeters
iris[ Width * Length > 20 ]
## Length Width Species
## 1: 5.4 3.9 setosa
## 2: 5.8 4.0 setosa
## 3: 5.7 4.4 setosa
## 4: 5.4 3.9 setosa
## 5: 5.7 3.8 setosa
## 6: 5.2 4.1 setosa
## 7: 5.5 4.2 setosa
## 8: 7.0 3.2 versicolor
## 9: 6.4 3.2 versicolor
## 10: 6.9 3.1 versicolor
## 11: 6.3 3.3 versicolor
## 12: 6.7 3.1 versicolor
## 13: 6.7 3.0 versicolor
## 14: 6.0 3.4 versicolor
## 15: 6.7 3.1 versicolor
## 16: 6.3 3.3 virginica
## 17: 7.1 3.0 virginica
## 18: 7.6 3.0 virginica
## 19: 7.3 2.9 virginica
## 20: 7.2 3.6 virginica
## 21: 6.5 3.2 virginica
## 22: 6.8 3.0 virginica
## 23: 6.4 3.2 virginica
## 24: 7.7 3.8 virginica
## 25: 7.7 2.6 virginica
## 26: 6.9 3.2 virginica
## 27: 7.7 2.8 virginica
## 28: 6.7 3.3 virginica
## 29: 7.2 3.2 virginica
## 30: 7.2 3.0 virginica
## 31: 7.4 2.8 virginica
## 32: 7.9 3.8 virginica
## 33: 7.7 3.0 virginica
## 34: 6.3 3.4 virginica
## 35: 6.9 3.1 virginica
## 36: 6.7 3.1 virginica
## 37: 6.9 3.1 virginica
## 38: 6.8 3.2 virginica
## 39: 6.7 3.3 virginica
## 40: 6.7 3.0 virginica
## 41: 6.2 3.4 virginica
## Length Width Species
# Add new boolean column
iris[, is_large := Width * Length > 25]
## Warning in `[.data.table`(iris, , `:=`(is_large, Width * Length > 25)):
## Invalid .internal.selfref detected and fixed by taking a (shallow) copy
## of the data.table so that := can add this new column by reference. At
## an earlier point, this data.table has been copied by R (or been created
## manually using structure() or similar). Avoid key<-, names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set*
## syntax instead to avoid copying: ?set, ?setnames and ?setattr. Also, in
## R<=v3.0.2, list(DT1,DT2) copied the entire DT1 and DT2 (R's list() used to
## copy named objects); please upgrade to R>v3.0.2 if that is biting. If this
## message doesn't help, please report to datatable-help so the root cause can
## be fixed.
# Now large observations with is_large
iris[is_large == TRUE]
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
iris[(is_large)] # Also OK
## Length Width Species is_large
## 1: 5.7 4.4 setosa TRUE
## 2: 7.2 3.6 virginica TRUE
## 3: 7.7 3.8 virginica TRUE
## 4: 7.9 3.8 virginica TRUE
# The 'keyed' data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12)
setkey(DT, A, B)
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "A" "B"
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Select the "b" group
DT["b"]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
# "b" and "c" groups
DT[c("b", "c")]
## A B C
## 1: b 1 8
## 2: b 5 6
## 3: b 8 11
## 4: c 6 12
## 5: c 9 9
# The first row of the "b" and "c" groups
DT[c("b", "c"), mult = "first"]
## A B C
## 1: b 1 8
## 2: c 6 12
# First and last row of the "b" and "c" groups
DT[c("b", "c"), .SD[c(1, .N)], by = .EACHI]
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Copy and extend code for instruction 4: add printout
DT[c("b", "c"), { print(.SD); .SD[c(1, .N)] }, by = .EACHI]
## B C
## 1: 1 8
## 2: 5 6
## 3: 8 11
## B C
## 1: 6 12
## 2: 9 9
## A B C
## 1: b 1 8
## 2: b 8 11
## 3: c 6 12
## 4: c 9 9
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Get the key of DT
key(DT)
## [1] "A" "B"
# Row where A == "b" and B == 6
DT[.("b", 6)]
## A B C
## 1: b 6 NA
# Return the prevailing row
DT[.("b", 6), roll=TRUE]
## A B C
## 1: b 6 6
# Return the nearest row
DT[.("b", 6), roll="nearest"]
## A B C
## 1: b 6 6
# Keyed data.table DT
DT <- data.table(A = letters[c(2, 1, 2, 3, 1, 2, 3)],
B = c(5, 4, 1, 9, 8, 8, 6),
C = 6:12,
key = "A,B")
str(DT)
## Classes 'data.table' and 'data.frame': 7 obs. of 3 variables:
## $ A: chr "a" "a" "b" "b" ...
## $ B: num 4 8 1 5 8 6 9
## $ C: int 7 10 8 6 11 12 9
## - attr(*, "sorted")= chr "A" "B"
## - attr(*, ".internal.selfref")=<externalptr>
DT
## A B C
## 1: a 4 7
## 2: a 8 10
## 3: b 1 8
## 4: b 5 6
## 5: b 8 11
## 6: c 6 12
## 7: c 9 9
# Print the sequence (-2):10 for the "b" group
DT[.("b", (-2):10)]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 NA
## 6: b 3 NA
## 7: b 4 NA
## 8: b 5 6
## 9: b 6 NA
## 10: b 7 NA
## 11: b 8 11
## 12: b 9 NA
## 13: b 10 NA
# Add code: carry the prevailing values forwards
DT[.("b", (-2):10), roll=TRUE]
## A B C
## 1: b -2 NA
## 2: b -1 NA
## 3: b 0 NA
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
# Add code: carry the first observation backwards
DT[.("b", (-2):10), roll=TRUE, rollends=TRUE]
## A B C
## 1: b -2 8
## 2: b -1 8
## 3: b 0 8
## 4: b 1 8
## 5: b 2 8
## 6: b 3 8
## 7: b 4 8
## 8: b 5 6
## 9: b 6 6
## 10: b 7 6
## 11: b 8 11
## 12: b 9 11
## 13: b 10 11
Jeff Ryan, the creator of quantmod and organizer of the R/Finance conference, has developed xts and zoo to simplify working with time series data. The course will cover five areas (chapters):
“xts” stands for extensible time series. The core of each “xts” is a “zoo” object, consisting of a matrix plus an index.
There are a few special behaviors of xts:
The “xts” object can be de-constructed when needed:
Data usually already exists and needs to be “wrangled” in to a proper format for xts/zoo. The easiest way to convert is using as.xts(). You can coerce truly external data after loading it, and can also save data with write.zoo(x, “file”).
Subsetting based on time is a particular strength of xts. xts supports ISO8601:2004 (the standard, “right way”, to unambiguously consider times):
xts allows for four methods of specifying dates or intervals:
Can also use some traditional R-like methods (since xts extends zoo, and zoo extends base R):
Can set the flag which.i = TRUE to get back the correct records (row numbers). For example, index <- x[“2007-06-26/2007-06-28”, which.i = TRUE].
Description of key behaviors when working with an xts object:
xts introduces a few relatives of the head() and tail() functionality. These are the first() and last() functions.
Math operations using xts - because an xts object is a matrix, you need to be careful about matrix operations. Math operations are run only on the intersection of the indices:
Merging time series is common. Merge (cbind, merge) combines by columns, but joining based on index.
Merge (rbind) combines by rows, though all rows must already have an index. Basically, rbind MUST be used on a time series.
Missing data is common, and xts inherits all of the zoo methods for dealing with missing data. The locf approach is “last observation carried forward” (the latest value that is not NA) - called with na.locf:
The NA can be managed in several ways:
Lag operators and difference operations. Seasonality is a repeating pattern. There is often a need to compare seasonality – for example, compare Mondays. Stationarity refers to some bound of the series.
The lag() function will change the timestamp, so that (for example) today can be merged as last week:
The “one period lag first difference” is calculated as diff(x, lag=1, differences=1, arithmetic=TRUE, log=FALSE, na.pad=TRUE, ...).
There are two main approaches for applying functions on discrete periods or intervals:
Time series aggregation can also be handled by xts:
Time series data can also be managed in a “rolling” manner - discrete or continuous:
Internals of xts such as indices and timezones:
Final topics:
Example code includes (cached to avoid future internet calls):
library(xts)
library(zoo)
x <- matrix(data=1:4, ncol=2)
idx <- as.Date(c("2015-01-01", "2015-02-01"))
# Create the xts
X <- xts(x, order.by = idx)
# Deconstruct the xts
coredata(X, fmt=FALSE)
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
index(X)
## [1] "2015-01-01" "2015-02-01"
# Working with the sunspots data
data(sunspots)
class(sunspots)
## [1] "ts"
sunspots_xts <- as.xts(sunspots)
class(sunspots_xts)
## [1] "xts" "zoo"
head(sunspots_xts)
## [,1]
## Jan 1749 58.0
## Feb 1749 62.6
## Mar 1749 70.0
## Apr 1749 55.7
## May 1749 85.0
## Jun 1749 83.5
# Example from chapter #1
ex_matrix <- xts(matrix(data=c(1, 1, 1, 2, 2, 2), ncol=2),
order.by=as.Date(c("2016-06-01", "2016-06-02", "2016-06-03"))
)
core <- coredata(ex_matrix)
# View the structure of ex_matrix
str(ex_matrix)
## An 'xts' object on 2016-06-01/2016-06-03 containing:
## Data: num [1:3, 1:2] 1 1 1 2 2 2
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
# Extract the 3rd observation of the 2nd column of ex_matrix
ex_matrix[3, 2]
## [,1]
## 2016-06-03 2
# Extract the 3rd observation of the 2nd column of core
core[3, 2]
## [1] 2
# Create the object data using 5 random numbers
data <- rnorm(5)
# Create dates as a Date class object starting from 2016-01-01
dates <- seq(as.Date("2016-01-01"), length = 5, by = "days")
# Use xts() to create smith
smith <- xts(x = data, order.by = dates)
# Create bday (1899-05-08) using a POSIXct date class object
bday <- as.POSIXct("1899-05-08")
# Create hayek and add a new attribute called born
hayek <- xts(x = data, order.by = dates, born = bday)
# Extract the core data of hayek
hayek_core <- coredata(hayek)
# View the class of hayek_core
class(hayek_core)
## [1] "matrix"
# Extract the index of hayek
hayek_index <- index(hayek)
# View the class of hayek_index
class(hayek_index)
## [1] "Date"
# Create dates
dates <- as.Date("2016-01-01") + 0:4
# Create ts_a
ts_a <- xts(x = 1:5, order.by = dates)
# Create ts_b
ts_b <- xts(x = 1:5, order.by = as.POSIXct(dates))
# Extract the rows of ts_a using the index of ts_b
ts_a[index(ts_b)]
## [,1]
## 2016-01-01 1
## 2016-01-02 2
## 2016-01-03 3
## 2016-01-04 4
## 2016-01-05 5
# Extract the rows of ts_b using the index of ts_a
ts_b[index(ts_a)]
## [,1]
data(austres)
# Convert austres to an xts object called au
au <- as.xts(austres)
# Convert your xts object (au) into a matrix am
am <- as.matrix(au)
# Convert the original austres into a matrix am2
am2 <- as.matrix(austres)
# Create dat by reading tmp_file
tmp_file <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1127/datasets/tmp_file.csv"
dat <- read.csv(tmp_file)
# Convert dat into xts
xts(dat, order.by = as.Date(rownames(dat), "%m/%d/%Y"))
## a b
## 2015-01-02 1 3
## 2015-02-03 2 4
# Read tmp_file using read.zoo
dat_zoo <- read.zoo(tmp_file, index.column = 0, sep = ",", format = "%m/%d/%Y")
# Convert dat_zoo to xts
dat_xts <- as.xts(dat_zoo)
# Convert sunspots to xts using as.xts(). Save this as sunspots_xts
sunspots_xts <- as.xts(sunspots)
# Get the temporary file name
tmp <- tempfile()
# Write the xts object using zoo to tmp
write.zoo(sunspots_xts, sep = ",", file = tmp)
# Read the tmp file. FUN = as.yearmon converts strings such as Jan 1749 into a proper time class
sun <- read.zoo(tmp, sep = ",", FUN = as.yearmon)
# Convert sun into xts. Save this as sun_xts
sun_xts <- as.xts(sun)
data(edhec, package="PerformanceAnalytics")
head(edhec["2007-01", 1])
## Convertible Arbitrage
## 2007-01-31 0.013
head(edhec["2007-01/2007-03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
head(edhec["200701/03", 1])
## Convertible Arbitrage
## 2007-01-31 0.0130
## 2007-02-28 0.0117
## 2007-03-31 0.0060
first(edhec[, "Funds of Funds"], "4 months")
## Funds of Funds
## 1997-01-31 0.0317
## 1997-02-28 0.0106
## 1997-03-31 -0.0077
## 1997-04-30 0.0009
last(edhec[, "Funds of Funds"], "1 year")
## Funds of Funds
## 2009-01-31 0.0060
## 2009-02-28 -0.0037
## 2009-03-31 0.0008
## 2009-04-30 0.0092
## 2009-05-31 0.0312
## 2009-06-30 0.0024
## 2009-07-31 0.0153
## 2009-08-31 0.0113
Chapter 1 - Data cleaning and summarization - ggplot2, dplyr, real-world dataset
United Nations dataset - voting history, from a scenario where every nation gets a vote:
Grouping and Summarizing - make the dataset manageable:
Sorting and filtering summarized data:
Example code includes:
# Grab only the sessions that are even numbered, then double-check that the list is unique by rcid
evenSessions <- unvotes::un_roll_calls %>%
filter(session %% 2 == 0)
nrow(evenSessions) == nrow(evenSessions %>%
select(rcid) %>%
distinct()
)
## [1] TRUE
# Double check that un_votes is unique on rcid-country, then inner_join the evenSessions file
nrow(unvotes::un_votes) == nrow(unvotes::un_votes %>%
select(rcid, country) %>%
distinct()
)
## [1] TRUE
baseData <- unvotes::un_votes %>%
inner_join(evenSessions, by="rcid") %>%
select(rcid, session, vote, country)
str(baseData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 353720 obs. of 4 variables:
## $ rcid : atomic 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "comment")= chr "rcid"
## $ session: num 2 2 2 2 2 2 2 2 2 2 ...
## $ vote : Factor w/ 3 levels "abstain","no",..: 3 3 3 2 3 3 1 3 2 3 ...
## $ country: chr "Paraguay" "Honduras" "Luxembourg" "Poland" ...
# Create the 1-2-3 system where 1=yes, 2=abstain, and 3=no
chrVotes <- as.character(baseData$vote)
fctVotes <- factor(chrVotes, levels=c("yes", "abstain", "no"))
intVotes <- as.integer(fctVotes)
table(chrVotes, intVotes) # confirm that 1=yes, 2=abstain, 3=no
## intVotes
## chrVotes 1 2 3
## abstain 0 45444 0
## no 0 0 25344
## yes 282932 0 0
baseData <- baseData %>%
mutate(oldFctVote = vote, vote=intVotes)
str(baseData) # 353,720 x 5
## Classes 'tbl_df', 'tbl' and 'data.frame': 353720 obs. of 5 variables:
## $ rcid : atomic 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "comment")= chr "rcid"
## $ session : num 2 2 2 2 2 2 2 2 2 2 ...
## $ vote : int 1 1 1 3 1 1 2 1 3 1 ...
## $ country : chr "Paraguay" "Honduras" "Luxembourg" "Poland" ...
## $ oldFctVote: Factor w/ 3 levels "abstain","no",..: 3 3 3 2 3 3 1 3 2 3 ...
# Create the full table of all combinations of rcid-session x country
# (so that votes can be entered there as either 9-not member or 8-not present)
uqVotes <- distinct(baseData[,c("rcid", "session")]) # 2,590 x 2
uqCountry <- distinct(baseData[,c("country"),drop=FALSE]) # 200x1
uqVotes$dummy <- 1L
uqCountry$dummy <- 1L
uqVoteCountry <- full_join(uqVotes, uqCountry, by="dummy") # 518,000 x 4 (rcid-session-dummy-country)
missVoteCountry <- uqVoteCountry %>%
select(-dummy) %>%
setdiff(select(baseData, -vote, -oldFctVote)) # 164,280 x 3 (rcid-session-country)
# Create the unique list of session-country
# (countries that voted at least once in a session will be assumed
# to have been not members at any votes missed in that session)
uqSessionCountry <- baseData %>%
select(session, country) %>%
distinct() # 4,744 x 2
nmVoteCountry <- missVoteCountry %>%
anti_join(uqSessionCountry, by=c("session", "country")) # 132,147 x 3 (rcid-session-country)
npVoteCountry <- missVoteCountry %>%
semi_join(uqSessionCountry, by=c("session", "country")) # 32,133 x 3 (rcid-session-country)
# Bind the rows together, noting their sources for the record
unvotes <- bind_rows(baseData,
mutate(nmVoteCountry, vote=9, oldFctVote=NA),
mutate(npVoteCountry, vote=8, oldFctVote=NA),
.id="source"
) # 518,000 x 6 (source-rcid-session-vote-country-oldFctVote)
# Put the UN code on them (the unvotes data uses the Correlates of War Number, variable "cown")
missCountry <- uqCountry %>%
select(-dummy) %>%
anti_join(countrycode::countrycode_data, by=c("country" = "country.name.en"))
reMap <- c(
"Bolivia, Plurinational State of"="Bolivia (Plurinational State of)",
"Congo, the Democratic Republic of the"="Democratic Republic of the Congo",
"Cote d'Ivoire"="Côte D'Ivoire",
"Gambia"="Gambia (Islamic Republic of the)",
"Guinea-Bissau"="Guinea Bissau",
"Iran, Islamic Republic of"="Iran (Islamic Republic of)",
"Korea, Democratic People's Republic of"="Democratic People's Republic of Korea",
"Korea, Republic of"="Republic of Korea",
"Macedonia, the former Yugoslav Republic of"="The former Yugoslav Republic of Macedonia",
"Micronesia, Federated States of"="Micronesia (Federated States of)",
"Moldova, Republic of"="Republic of Moldova",
"Tanzania, United Republic of"="United Republic of Tanzania",
"United Kingdom"="United Kingdom of Great Britain and Northern Ireland",
"United States"="United States of America"
)
mapMissCountry <- missCountry %>%
mutate(newCountry=reMap[country]) %>%
left_join(select(countrycode::countrycode_data, country.name.en, iso3n, un, cown),
by=c("newCountry" = "country.name.en")
)
mapOKCountry <- uqCountry %>%
select(-dummy) %>%
inner_join(select(countrycode::countrycode_data, country.name.en, iso3n, un, cown),
by=c("country" = "country.name.en")
)
mapCountry <- mapMissCountry %>%
select(country, iso3n, un, cown) %>%
bind_rows(mapOKCountry) # 200 x 4
mapCountry[duplicated(mapCountry$cown), ] # no countries
## # A tibble: 0 × 4
## # ... with 4 variables: country <chr>, iso3n <int>, un <int>, cown <int>
# Place the cown code on the unvotes dataset as ccode, and delete the records where it is NA
unvotes %>%
anti_join(mapCountry, by=c("country")) # None, as it should be
## # A tibble: 0 × 6
## # ... with 6 variables: source <chr>, rcid <dbl>, session <dbl>,
## # vote <dbl>, country <chr>, oldFctVote <fctr>
unvotes_tmp <- unvotes %>%
left_join(mapCountry, by=c("country"))
votes <- unvotes_tmp %>%
filter(!is.na(cown)) %>%
mutate(ccode=cown) %>%
select(rcid, session, vote, ccode) # 515,410 x 4 (rcid-session-vote-ccode; down from 518,000 after dropping rows with no cown match)
# Now can actually run the process on the newly created "votes" dataset
# Print the votes dataset
votes
## # A tibble: 515,410 × 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 150
## 2 46 2 1 91
## 3 46 2 1 212
## 4 46 2 3 290
## 5 46 2 1 900
## 6 46 2 1 140
## 7 46 2 2 530
## 8 46 2 1 840
## 9 46 2 3 365
## 10 46 2 1 160
## # ... with 515,400 more rows
# Filter for only votes that are "yes", "abstain", or "no"
votes %>% filter(vote <= 3)
## # A tibble: 351,529 × 4
## rcid session vote ccode
## <dbl> <dbl> <dbl> <int>
## 1 46 2 1 150
## 2 46 2 1 91
## 3 46 2 1 212
## 4 46 2 3 290
## 5 46 2 1 900
## 6 46 2 1 140
## 7 46 2 2 530
## 8 46 2 1 840
## 9 46 2 3 365
## 10 46 2 1 160
## # ... with 351,519 more rows
# Add another %>% step to add a year column
votes %>%
filter(vote <= 3) %>%
mutate(year=1945+session)
## # A tibble: 351,529 × 5
## rcid session vote ccode year
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 46 2 1 150 1947
## 2 46 2 1 91 1947
## 3 46 2 1 212 1947
## 4 46 2 3 290 1947
## 5 46 2 1 900 1947
## 6 46 2 1 140 1947
## 7 46 2 2 530 1947
## 8 46 2 1 840 1947
## 9 46 2 3 365 1947
## 10 46 2 1 160 1947
## # ... with 351,519 more rows
# Convert country code 100
countrycode::countrycode(100, "cown", "country.name")
## [1] "Colombia"
# Add a country column within the mutate: votes_processed
votes_processed <- votes %>%
filter(vote <= 3) %>%
mutate(year = session + 1945,
country = countrycode::countrycode(ccode, "cown", "country.name")
)
# Print votes_processed
votes_processed
## # A tibble: 351,529 × 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 150 1947 Paraguay
## 2 46 2 1 91 1947 Honduras
## 3 46 2 1 212 1947 Luxembourg
## 4 46 2 3 290 1947 Poland
## 5 46 2 1 900 1947 Australia
## 6 46 2 1 140 1947 Brazil
## 7 46 2 2 530 1947 Ethiopia
## 8 46 2 1 840 1947 Philippines
## 9 46 2 3 365 1947 Russian Federation
## 10 46 2 1 160 1947 Argentina
## # ... with 351,519 more rows
# Find total and fraction of "yes" votes
votes_processed %>%
summarize(total=n(), percent_yes=mean(vote==1))
## # A tibble: 1 × 2
## total percent_yes
## <int> <dbl>
## 1 351529 0.7997719
# Change this code to summarize by year
votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
## # A tibble: 34 × 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 2039 0.5693968
## 2 1949 3469 0.4375901
## 3 1951 1434 0.5850767
## 4 1953 1537 0.6317502
## 5 1955 2169 0.6947902
## 6 1957 2708 0.6085672
## 7 1959 4326 0.5880721
## 8 1961 7417 0.5726035
## 9 1963 3277 0.7296308
## 10 1965 4341 0.7065192
## # ... with 24 more rows
# Summarize by country: by_country
by_country <- votes_processed %>%
group_by(country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Print the by_country dataset
by_country
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Afghanistan 2373 0.8592499
## 2 Albania 1696 0.7169811
## 3 Algeria 2214 0.8992773
## 4 Andorra 720 0.6375000
## 5 Angola 1432 0.9238827
## 6 Antigua and Barbuda 1303 0.9125096
## 7 Argentina 2554 0.7678152
## 8 Armenia 758 0.7467018
## 9 Australia 2576 0.5562888
## 10 Austria 2390 0.6221757
## # ... with 189 more rows
# Sort in ascending order of percent_yes
by_country %>%
arrange(percent_yes)
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Zanzibar 2 0.0000000
## 2 United States of America 2569 0.2693655
## 3 Palau 370 0.3378378
## 4 Israel 2381 0.3406132
## 5 Federal Republic of Germany 1075 0.3972093
## 6 United Kingdom of Great Britain and Northern Ireland 2559 0.4165690
## 7 France 2528 0.4264241
## 8 Micronesia (Federated States of) 724 0.4419890
## 9 Marshall Islands 757 0.4914135
## 10 Belgium 2569 0.4920202
## # ... with 189 more rows
# Now sort in descending order
by_country %>%
arrange(-percent_yes)
## # A tibble: 199 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 Sao Tome and Principe 1091 0.9761687
## 2 Seychelles 882 0.9750567
## 3 Djibouti 1599 0.9612258
## 4 Guinea Bissau 1539 0.9603639
## 5 Timor-Leste 327 0.9571865
## 6 Mauritius 1832 0.9497817
## 7 Zimbabwe 1362 0.9493392
## 8 Comoros 1134 0.9470899
## 9 United Arab Emirates 1935 0.9467700
## 10 Mozambique 1702 0.9465335
## # ... with 189 more rows
# Filter out countries with fewer than 100 votes
by_country %>%
arrange(percent_yes) %>%
filter(total >= 100)
## # A tibble: 196 × 3
## country total percent_yes
## <chr> <int> <dbl>
## 1 United States of America 2569 0.2693655
## 2 Palau 370 0.3378378
## 3 Israel 2381 0.3406132
## 4 Federal Republic of Germany 1075 0.3972093
## 5 United Kingdom of Great Britain and Northern Ireland 2559 0.4165690
## 6 France 2528 0.4264241
## 7 Micronesia (Federated States of) 724 0.4419890
## 8 Marshall Islands 757 0.4914135
## 9 Belgium 2569 0.4920202
## 10 Canada 2577 0.5079550
## # ... with 186 more rows
Chapter 2 - Visualization with ggplot2
General ggplot2 background - better exploration of the trends over time:
Visualizing by country - see for an individual country or groups of countries:
Faceting to show multiple plots:
Example code includes:
# Define by_year
by_year <- votes_processed %>%
group_by(year) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Load the ggplot2 package
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Create line plot of the yearly "yes" share over time
ggplot(by_year, aes(x=year, y=percent_yes)) +
geom_line()
# Same data as points, with a smoothing curve overlaid (defaults to LOESS here)
ggplot(by_year, aes(year, percent_yes)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess'
# Group by year and country: by_year_country
# (one row per country-year; note summarize() drops only the innermost
# grouping level, so the result is still grouped by year)
by_year_country <- votes_processed %>%
group_by(year, country) %>%
summarize(total = n(),
percent_yes = mean(vote == 1)
)
# Print by_year_country
by_year_country
## Source: local data frame [4,717 x 4]
## Groups: year [?]
##
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 Afghanistan 34 0.3823529
## 2 1947 Argentina 38 0.5789474
## 3 1947 Australia 38 0.5526316
## 4 1947 Belarus 38 0.5000000
## 5 1947 Belgium 38 0.6052632
## 6 1947 Bolivia (Plurinational State of) 37 0.5945946
## 7 1947 Brazil 38 0.6578947
## 8 1947 Canada 38 0.6052632
## 9 1947 Chile 38 0.6578947
## 10 1947 Colombia 35 0.5428571
## # ... with 4,707 more rows
# Create a filtered version: UK_by_year (the UN dataset uses the full formal
# country name, so the match string must be exact)
UK_by_year <- by_year_country %>%
filter(country == "United Kingdom of Great Britain and Northern Ireland")
# Line plot of percent_yes over time for UK only
ggplot(UK_by_year, aes(x=year, y=percent_yes)) + geom_line()
# Vector of four countries to examine
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"India"
)
# Filter by_year_country: filtered_4_countries
filtered_4_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes in four countries
# (mapping color to country draws one separately-colored line per country)
ggplot(filtered_4_countries, aes(x=year, y=percent_yes, color=country)) +
geom_line()
# Expand to six countries for faceted comparison
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"Japan",
"Brazil",
"India"
)
# Filtered by_year_country: filtered_6_countries
filtered_6_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country
ggplot(filtered_6_countries, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap(~ country)
# Same plot, but each facet gets its own y-axis range.
# NOTE(review): the documented argument is scales= (plural); scale= only
# works via R's partial argument matching
ggplot(filtered_6_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scale="free_y")
# Expand again to nine countries
countries <- c("United States of America",
"United Kingdom of Great Britain and Northern Ireland",
"France",
"Japan",
"Brazil",
"India",
"Canada",
"Mexico",
"Israel"
)
# Filtered by_year_country: filtered_countries
filtered_countries <- by_year_country %>%
filter(country %in% countries)
# Line plot of % yes over time faceted by country, free y scales per facet
ggplot(filtered_countries, aes(year, percent_yes)) +
geom_line() +
facet_wrap(~ country, scales = "free_y")
Chapter 3 - Tidy modeling with broom
Linear regression - quantifying trends (best-fit-lines):
Tidying models with broom:
Nesting for multiple models:
Fitting multiple models to the nested data:
Working with many tidy models:
Example code includes:
# Percentage of yes votes from the US by year: US_by_year
US_by_year <- by_year_country %>%
filter(country == "United States of America")
# Print the US_by_year data
US_by_year
## Source: local data frame [34 x 4]
## Groups: year [34]
##
## year country total percent_yes
## <dbl> <chr> <int> <dbl>
## 1 1947 United States of America 38 0.7105263
## 2 1949 United States of America 64 0.2812500
## 3 1951 United States of America 25 0.4000000
## 4 1953 United States of America 26 0.5000000
## 5 1955 United States of America 37 0.6216216
## 6 1957 United States of America 34 0.6470588
## 7 1959 United States of America 54 0.4259259
## 8 1961 United States of America 75 0.5066667
## 9 1963 United States of America 32 0.5000000
## 10 1965 United States of America 41 0.3658537
## # ... with 24 more rows
# Perform a linear regression of percent_yes by year: US_fit
# (the year coefficient estimates the average annual change in the US
# "yes" share)
US_fit <- lm(percent_yes ~ year, data=US_by_year)
# Perform summary() on the US_fit object
summary(US_fit)
##
## Call:
## lm(formula = percent_yes ~ year, data = US_by_year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.222557 -0.080540 -0.008592 0.081983 0.194232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6724804 1.8378722 6.895 8.36e-08 ***
## year -0.0062435 0.0009282 -6.727 1.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1062 on 32 degrees of freedom
## Multiple R-squared: 0.5858, Adjusted R-squared: 0.5728
## F-statistic: 45.25 on 1 and 32 DF, p-value: 1.348e-07
# Call the tidy() function on the US_fit object
# (turns the model's coefficient table into a regular data frame,
# one row per term)
broom::tidy(US_fit)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.672480427 1.8378722388 6.895191 8.360047e-08
## 2 year -0.006243547 0.0009281727 -6.726708 1.347828e-07
# Linear regression of percent_yes by year for US
US_by_year <- by_year_country %>%
filter(country == "United States of America")
US_fit <- lm(percent_yes ~ year, US_by_year)
# Fit model for the United Kingdom
UK_by_year <- by_year_country %>%
filter(country == "United Kingdom of Great Britain and Northern Ireland")
UK_fit <- lm(percent_yes ~ year, UK_by_year)
# Create US_tidied and UK_tidied
US_tidied <- broom::tidy(US_fit)
UK_tidied <- broom::tidy(UK_fit)
# Combine the two tidied models into one data frame (rows 1-2 are the US
# model, rows 3-4 the UK model; tidy data frames stack cleanly like this,
# which is the point of broom)
bind_rows(US_tidied, UK_tidied)
## term estimate std.error statistic p.value
## 1 (Intercept) 12.672480427 1.8378722388 6.895191 8.360047e-08
## 2 year -0.006243547 0.0009281727 -6.726708 1.347828e-07
## 3 (Intercept) -3.237477542 1.9542206633 -1.656659 1.073629e-01
## 4 year 0.001854637 0.0009869317 1.879195 6.935175e-02
# Seems like a HORRIBLE function; messed up all the data unless it was 1) ungrouped, and 2) arranged by the planned nesting variables
# NOTE(review): likely an interaction between grouped tibbles and this tidyr
# version's nest(); the ungroup() + arrange() here works around it -- confirm
# against current tidyr, where nest(data = -country) is the modern spelling
# Nest all columns besides country
by_year_country %>%
ungroup() %>%
arrange(country) %>%
tidyr::nest(-country)
## # A tibble: 199 × 2
## country data
## <chr> <list>
## 1 Afghanistan <tibble [34 × 3]>
## 2 Albania <tibble [29 × 3]>
## 3 Algeria <tibble [26 × 3]>
## 4 Andorra <tibble [11 × 3]>
## 5 Angola <tibble [19 × 3]>
## 6 Antigua and Barbuda <tibble [17 × 3]>
## 7 Argentina <tibble [34 × 3]>
## 8 Armenia <tibble [12 × 3]>
## 9 Australia <tibble [34 × 3]>
## 10 Austria <tibble [29 × 3]>
## # ... with 189 more rows
# Save the nested version: one row per country, with a list-column "data"
# holding each country's year/total/percent_yes tibble
nested <- by_year_country %>%
ungroup() %>%
arrange(country) %>%
tidyr::nest(-country)
# Print the nested data for Brazil
# (single-bracket indexing on the list-column returns a one-element list)
nested$data[nested$country == "Brazil"]
## [[1]]
## # A tibble: 34 × 3
## year total percent_yes
## <dbl> <int> <dbl>
## 1 1947 38 0.6578947
## 2 1949 64 0.4687500
## 3 1951 25 0.6400000
## 4 1953 26 0.7307692
## 5 1955 37 0.7297297
## 6 1957 34 0.7352941
## 7 1959 54 0.5370370
## 8 1961 76 0.5526316
## 9 1963 32 0.7812500
## 10 1965 41 0.6097561
## # ... with 24 more rows
# Unnest the data column to return it to its original form
# (inverse of nest(): expands each per-country tibble back into rows)
tidyr::unnest(nested, data)
## # A tibble: 4,717 × 4
## country year total percent_yes
## <chr> <dbl> <int> <dbl>
## 1 Afghanistan 1947 34 0.3823529
## 2 Afghanistan 1949 51 0.6078431
## 3 Afghanistan 1951 25 0.7600000
## 4 Afghanistan 1953 26 0.7692308
## 5 Afghanistan 1955 37 0.7297297
## 6 Afghanistan 1957 34 0.5294118
## 7 Afghanistan 1959 54 0.6111111
## 8 Afghanistan 1961 76 0.6052632
## 9 Afghanistan 1963 32 0.7812500
## 10 Afghanistan 1965 40 0.8500000
## # ... with 4,707 more rows
# Perform a linear regression on each item in the data column
# (map() fits one percent_yes ~ year model per country; the fitted lm
# objects are then stored as a list-column alongside the data)
mdls <- purrr::map(nested$data, ~ lm(percent_yes ~ year, .))
nested %>%
mutate(model = mdls)
## # A tibble: 199 × 3
## country data model
## <chr> <list> <list>
## 1 Afghanistan <tibble [34 × 3]> <S3: lm>
## 2 Albania <tibble [29 × 3]> <S3: lm>
## 3 Algeria <tibble [26 × 3]> <S3: lm>
## 4 Andorra <tibble [11 × 3]> <S3: lm>
## 5 Angola <tibble [19 × 3]> <S3: lm>
## 6 Antigua and Barbuda <tibble [17 × 3]> <S3: lm>
## 7 Argentina <tibble [34 × 3]> <S3: lm>
## 8 Armenia <tibble [12 × 3]> <S3: lm>
## 9 Australia <tibble [34 × 3]> <S3: lm>
## 10 Austria <tibble [29 × 3]> <S3: lm>
## # ... with 189 more rows
# This one errors out for some reason (only in knitr, not in the console)
# nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, .)))
# Add another mutate that applies tidy() to each model
# (tidyModel is a list of per-country coefficient data frames, one per lm)
tidyModel <- purrr::map(mdls, ~broom::tidy(.))
nested %>%
mutate(model = mdls) %>%
mutate(tidied = tidyModel)
## # A tibble: 199 × 4
## country data model tidied
## <chr> <list> <list> <list>
## 1 Afghanistan <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 2 Albania <tibble [29 × 3]> <S3: lm> <data.frame [2 × 5]>
## 3 Algeria <tibble [26 × 3]> <S3: lm> <data.frame [2 × 5]>
## 4 Andorra <tibble [11 × 3]> <S3: lm> <data.frame [2 × 5]>
## 5 Angola <tibble [19 × 3]> <S3: lm> <data.frame [2 × 5]>
## 6 Antigua and Barbuda <tibble [17 × 3]> <S3: lm> <data.frame [2 × 5]>
## 7 Argentina <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 8 Armenia <tibble [12 × 3]> <S3: lm> <data.frame [2 × 5]>
## 9 Australia <tibble [34 × 3]> <S3: lm> <data.frame [2 × 5]>
## 10 Austria <tibble [29 × 3]> <S3: lm> <data.frame [2 × 5]>
## # ... with 189 more rows
# This one errors out for some reason (only in knitr, not in the console)
# nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .))) %>%
# mutate(tidied = purrr::map(model, ~ broom::tidy(.)))
# Add one more step that unnests the tidied column
# (yields one row per country per model term: intercept + year slope)
country_coefficients <- nested %>%
mutate(model = mdls,
tidied = tidyModel
) %>%
tidyr::unnest(tidied)
# Same erroring out issue in knitr . . .
# country_coefficients <- nested %>%
# mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .)),
# tidied = purrr::map(model, broom::tidy)
# ) %>%
# tidyr::unnest(tidied)
# Print the resulting country_coefficients variable
country_coefficients
## # A tibble: 397 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.52325215
## 2 Afghanistan year 0.006009299 0.0007426499 8.09169837
## 3 Albania (Intercept) 3.360559305 3.3085971803 1.01570518
## 4 Albania year -0.001345460 0.0016667404 -0.80724056
## 5 Algeria (Intercept) -5.461121731 1.7452792997 -3.12908182
## 6 Algeria year 0.003193022 0.0008778821 3.63718725
## 7 Andorra (Intercept) -0.358359014 4.8835752846 -0.07338046
## 8 Andorra year 0.000493452 0.0024381183 0.20239049
## 9 Angola (Intercept) 3.093752452 2.0124923762 1.53727412
## 10 Angola year -0.001090811 0.0010087529 -1.08134636
## # ... with 387 more rows, and 1 more variables: p.value <dbl>
# Print the country_coefficients dataset (same object printed just above;
# repeated from the course exercise)
country_coefficients
## # A tibble: 397 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan (Intercept) -11.063084650 1.4705189228 -7.52325215
## 2 Afghanistan year 0.006009299 0.0007426499 8.09169837
## 3 Albania (Intercept) 3.360559305 3.3085971803 1.01570518
## 4 Albania year -0.001345460 0.0016667404 -0.80724056
## 5 Algeria (Intercept) -5.461121731 1.7452792997 -3.12908182
## 6 Algeria year 0.003193022 0.0008778821 3.63718725
## 7 Andorra (Intercept) -0.358359014 4.8835752846 -0.07338046
## 8 Andorra year 0.000493452 0.0024381183 0.20239049
## 9 Angola (Intercept) 3.093752452 2.0124923762 1.53727412
## 10 Angola year -0.001090811 0.0010087529 -1.08134636
## # ... with 387 more rows, and 1 more variables: p.value <dbl>
# Filter for only the slope terms (term == "year"), discarding intercepts
country_coefficients %>%
filter(term == "year")
## # A tibble: 198 × 6
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499 8.0916984
## 2 Albania year -0.001345460 0.0016667404 -0.8072406
## 3 Algeria year 0.003193022 0.0008778821 3.6371873
## 4 Andorra year 0.000493452 0.0024381183 0.2023905
## 5 Angola year -0.001090811 0.0010087529 -1.0813464
## 6 Antigua and Barbuda year 0.001079916 0.0010590399 1.0197121
## 7 Argentina year 0.005152270 0.0010610352 4.8558902
## 8 Armenia year -0.003570723 0.0035892632 -0.9948346
## 9 Australia year 0.002553740 0.0010859947 2.3515218
## 10 Austria year 0.002840993 0.0008664018 3.2790715
## # ... with 188 more rows, and 1 more variables: p.value <dbl>
# Filter for only the slope terms
slope_terms <- country_coefficients %>%
filter(term == "year")
# Add p.adjusted column, then filter
# (p.adjust() defaults to the Holm correction, controlling for the ~198
# simultaneous hypothesis tests)
slope_terms %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
## # A tibble: 62 × 7
## country term estimate std.error
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan year 0.006009299 0.0007426499
## 2 Argentina year 0.005152270 0.0010610352
## 3 Barbados year 0.005616368 0.0013347331
## 4 Belarus year 0.003912506 0.0007585622
## 5 Belgium year 0.003186372 0.0007630472
## 6 Bolivia (Plurinational State of) year 0.005803654 0.0009657579
## 7 Brazil year 0.006108871 0.0008167495
## 8 Cambodia year 0.006792013 0.0011544253
## 9 Central African Republic year 0.005567740 0.0013039928
## 10 Chile year 0.006776937 0.0008220202
## # ... with 52 more rows, and 3 more variables: statistic <dbl>,
## # p.value <dbl>, p.adjusted <dbl>
# Filter by adjusted p-values: keep only countries whose yearly trend is
# significant after multiple-testing correction
filtered_countries <- country_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < .05)
# Sort for the countries increasing most quickly
filtered_countries %>%
arrange(desc(estimate))
## # A tibble: 62 × 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 South Africa year 0.011861365 0.0014004289 8.469809
## 2 Kazakhstan year 0.010955741 0.0019482401 5.623404
## 3 Yemen Arab Republic year 0.010854882 0.0015869058 6.840281
## 4 Kyrgyzstan year 0.009725462 0.0009884060 9.839541
## 5 Malawi year 0.009087765 0.0018112478 5.017406
## 6 Dominican Republic year 0.008055482 0.0009138578 8.814809
## 7 Portugal year 0.007996968 0.0017114569 4.672609
## 8 Honduras year 0.007721191 0.0009211022 8.382556
## 9 Peru year 0.007301189 0.0009763560 7.477999
## 10 Nicaragua year 0.007077883 0.0010715994 6.604971
## # ... with 52 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
# Sort for the countries decreasing most quickly (most negative slope first)
filtered_countries %>%
arrange(estimate)
## # A tibble: 62 × 7
## country term estimate std.error statistic
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Republic of Korea year -0.009256217 0.0015111366 -6.125334
## 2 Israel year -0.006859149 0.0011717864 -5.853583
## 3 United States of America year -0.006243547 0.0009281727 -6.726708
## 4 Belgium year 0.003186372 0.0007630472 4.175852
## 5 Guinea year 0.003623915 0.0008324455 4.353336
## 6 Morocco year 0.003800921 0.0008601889 4.418705
## 7 Belarus year 0.003912506 0.0007585622 5.157792
## 8 Iran (Islamic Republic of) year 0.003914836 0.0008554901 4.576133
## 9 Congo year 0.003967778 0.0009220262 4.303324
## 10 Sudan year 0.003991321 0.0009613509 4.151784
## # ... with 52 more rows, and 2 more variables: p.value <dbl>,
## # p.adjusted <dbl>
Chapter 4 - Joining and Tidying
Joining datasets - bringing in the descriptions for each type of roll call vote:
Tidy data - creating graphs faceted by issue, and with lines for a few key countries:
Tidy modeling by topic and country - running linear models by country and topic:
Example code includes:
# The dataset unvotes::un_roll_call_issues is 4,951 x 3 [rcid-short_name-issue]
str(unvotes::un_roll_call_issues) # 4,951x3
## Classes 'tbl_df', 'tbl' and 'data.frame': 4951 obs. of 3 variables:
## $ rcid : num 30 34 77 9002 9003 ...
## $ short_name: chr "me" "me" "me" "me" ...
## $ issue : chr "Palestinian conflict" "Palestinian conflict" "Palestinian conflict" "Palestinian conflict" ...
table(unvotes::un_roll_call_issues$short_name) # Has the 6 key issues we are seeking
##
## co di ec hr me nu
## 971 859 461 901 1047 712
sum(table(unvotes::un_roll_call_issues$short_name)) # 4,951
## [1] 4951
nrow(distinct(select(unvotes::un_roll_call_issues, rcid))) # 3,813 (there are duplicates by rcid)
## [1] 3813
# One-hot encode issue membership: spread short_name into six 0/1 indicator
# columns, one row per rcid (a roll call can belong to multiple issues)
tmpData <- unvotes::un_roll_call_issues %>%
mutate(dummy=1) %>%
select(rcid, short_name, dummy) %>%
tidyr::spread(key=short_name, value=dummy, fill=0)
str(tmpData) # 3,813 x 7 (rcid-6 issues)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3813 obs. of 7 variables:
## $ rcid: num 6 8 11 18 19 24 26 27 28 29 ...
## $ co : num 0 0 1 0 0 0 1 1 1 1 ...
## $ di : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ec : num 0 1 0 1 1 1 0 0 0 0 ...
## $ hr : num 1 0 0 0 0 0 0 0 0 0 ...
## $ me : num 0 0 0 0 0 0 0 0 0 0 ...
## $ nu : num 0 0 0 0 0 0 0 0 0 0 ...
# Count how many issues each roll call is tagged with
tmpData %>%
select(-rcid) %>%
rowSums() %>%
table() # 2,836 are 1 ; 816 are 2 ; 161 are 3
## .
## 1 2 3
## 2836 816 161
# The dataset unvotes::un_roll_calls is 5,356 x 9 [rcid-session-importantvote-date-unres-amend-para-short-descr]
str(unvotes::un_roll_calls) # 5,356 x 9
## Classes 'tbl_df', 'tbl' and 'data.frame': 5356 obs. of 9 variables:
## $ rcid : num 3 4 5 6 7 8 9 10 11 12 ...
## $ session : num 1 1 1 1 1 1 1 1 1 1 ...
## $ importantvote: num 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date, format: "1946-01-01" "1946-01-02" ...
## $ unres : chr "R/1/66" "R/1/79" "R/1/98" "R/1/107" ...
## $ amend : num 1 0 0 0 1 1 0 1 0 1 ...
## $ para : num 0 0 0 0 0 0 0 1 0 1 ...
## $ short : chr "AMENDMENTS, RULES OF PROCEDURE" "SECURITY COUNCIL ELECTIONS" "VOTING PROCEDURE" "DECLARATION OF HUMAN RIGHTS" ...
## $ descr : chr "TO ADOPT A CUBAN AMENDMENT TO THE UK PROPOSAL REFERRING THE PROVISIONAL RULES OF PROCEDURE AND ANY AMENDMENTS THEREOF TO THE 6T"| __truncated__ "TO ADOPT A USSR PROPOSAL ADJOURNING DEBATE ON AND POSTPONINGELECTIONS OF THE NON-PERMANENT MEMBERS OF THE SECURITY COUNCIL, TO "| __truncated__ "TO ADOPT THE KOREAN PROPOSAL THAT INVALID BALLOTS BE INCLUDED IN THE TOTAL NUMBER OF \\MEMBERS PRESENT AND VOTING\\\\, IN CALCU"| __truncated__ "TO ADOPT A CUBAN PROPOSAL (A/3-C) THAT AN ITEM ON A DECLARATION OF THE RIGHTS AND DUTIES OF MAN BE TABLED." ...
# Confirm rcid uniquely identifies rows in un_roll_calls (safe join key)
nrow(distinct(select(unvotes::un_roll_calls, rcid))) == nrow(unvotes::un_roll_calls) # TRUE (no duplicates)
## [1] TRUE
# Combine the datasets to create "descriptions" which should have 10 columns (rcid-session-date-unres-6 numerics)
# The dataset "descriptions" should have only the even numbered sessions
descriptions <- unvotes::un_roll_calls %>%
select(rcid, session, date, unres) %>%
left_join(tmpData, by="rcid") %>%
filter(session %% 2 == 0)
# Roll calls with no issue tag get NA from the left join; treat them as 0
# (not about that issue) across all six indicator columns
numVars <- c("me", "nu", "di", "hr", "co", "ec")
descriptions[, numVars][is.na(descriptions[, numVars])] <- 0
# Print the votes_processed dataset
votes_processed
## # A tibble: 351,529 × 6
## rcid session vote ccode year country
## <dbl> <dbl> <dbl> <int> <dbl> <chr>
## 1 46 2 1 150 1947 Paraguay
## 2 46 2 1 91 1947 Honduras
## 3 46 2 1 212 1947 Luxembourg
## 4 46 2 3 290 1947 Poland
## 5 46 2 1 900 1947 Australia
## 6 46 2 1 140 1947 Brazil
## 7 46 2 2 530 1947 Ethiopia
## 8 46 2 1 840 1947 Philippines
## 9 46 2 3 365 1947 Russian Federation
## 10 46 2 1 160 1947 Argentina
## # ... with 351,519 more rows
# Print the descriptions dataset
descriptions
## # A tibble: 2,590 × 10
## rcid session date unres co di ec hr me nu
## <dbl> <dbl> <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 46 2 1947-09-04 R/2/299 0 0 0 0 0 0
## 2 47 2 1947-10-05 R/2/355 0 0 0 1 0 0
## 3 48 2 1947-10-06 R/2/461 0 0 0 0 0 0
## 4 49 2 1947-10-06 R/2/463 0 0 0 0 0 0
## 5 50 2 1947-10-06 R/2/465 0 0 0 0 0 0
## 6 51 2 1947-10-02 R/2/561 1 0 0 0 0 0
## 7 52 2 1947-11-06 R/2/650 1 0 0 0 0 0
## 8 53 2 1947-11-06 R/2/651 1 0 0 0 0 0
## 9 54 2 1947-11-06 R/2/651 1 0 0 0 0 0
## 10 55 2 1947-11-06 R/2/667 1 0 0 0 0 0
## # ... with 2,580 more rows
# Join them together based on the "rcid" and "session" columns
# (inner join keeps only votes whose roll call appears in descriptions)
votes_joined <- inner_join(votes_processed, descriptions, by=c("rcid", "session"))
votes_joined # 351,529 x 14 per the output below (original note said 353,720)
## # A tibble: 351,529 × 14
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date> <chr>
## 1 46 2 1 150 1947 Paraguay 1947-09-04 R/2/299
## 2 46 2 1 91 1947 Honduras 1947-09-04 R/2/299
## 3 46 2 1 212 1947 Luxembourg 1947-09-04 R/2/299
## 4 46 2 3 290 1947 Poland 1947-09-04 R/2/299
## 5 46 2 1 900 1947 Australia 1947-09-04 R/2/299
## 6 46 2 1 140 1947 Brazil 1947-09-04 R/2/299
## 7 46 2 2 530 1947 Ethiopia 1947-09-04 R/2/299
## 8 46 2 1 840 1947 Philippines 1947-09-04 R/2/299
## 9 46 2 3 365 1947 Russian Federation 1947-09-04 R/2/299
## 10 46 2 1 160 1947 Argentina 1947-09-04 R/2/299
## # ... with 351,519 more rows, and 6 more variables: co <dbl>, di <dbl>,
## # ec <dbl>, hr <dbl>, me <dbl>, nu <dbl>
# Filter for votes related to colonialism (co indicator column == 1)
votes_joined %>%
filter(co == 1)
## # A tibble: 60,589 × 14
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 60,579 more rows, and 7 more variables: unres <chr>, co <dbl>,
## # di <dbl>, ec <dbl>, hr <dbl>, me <dbl>, nu <dbl>
# Filter, then summarize by year: US_co_by_year
# (US votes on colonialism roll calls only)
US_co_by_year <- votes_joined %>%
filter(country=="United States of America", co==1) %>%
group_by(year) %>%
summarize(percent_yes = mean(vote == 1))
# Graph the % of "yes" votes over time
ggplot(US_co_by_year, aes(x=year, y=percent_yes)) + geom_line()
# Gather the six me/nu/di/hr/co/ec columns into long form: each vote row is
# repeated once per topic, with topic (column name) and has_topic (0/1)
votes_joined %>%
tidyr::gather(topic, has_topic, co:nu)
## # A tibble: 2,109,174 × 10
## rcid session vote ccode year country date unres
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date> <chr>
## 1 46 2 1 150 1947 Paraguay 1947-09-04 R/2/299
## 2 46 2 1 91 1947 Honduras 1947-09-04 R/2/299
## 3 46 2 1 212 1947 Luxembourg 1947-09-04 R/2/299
## 4 46 2 3 290 1947 Poland 1947-09-04 R/2/299
## 5 46 2 1 900 1947 Australia 1947-09-04 R/2/299
## 6 46 2 1 140 1947 Brazil 1947-09-04 R/2/299
## 7 46 2 2 530 1947 Ethiopia 1947-09-04 R/2/299
## 8 46 2 1 840 1947 Philippines 1947-09-04 R/2/299
## 9 46 2 3 365 1947 Russian Federation 1947-09-04 R/2/299
## 10 46 2 1 160 1947 Argentina 1947-09-04 R/2/299
## # ... with 2,109,164 more rows, and 2 more variables: topic <chr>,
## # has_topic <dbl>
# Perform gather again, then filter to rows where the vote's roll call is
# actually about the topic (has_topic == 1)
votes_gathered <- votes_joined %>%
tidyr::gather(topic, has_topic, co:nu) %>%
filter(has_topic == 1)
votes_gathered # 347,890 x 10 per the output below (original note said 350,052)
## # A tibble: 347,890 × 10
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 347,880 more rows, and 3 more variables: unres <chr>,
## # topic <chr>, has_topic <dbl>
# Replace the two-letter codes in topic: votes_tidied
# (recode() maps each short code to its human-readable issue name)
votes_tidied <- votes_gathered %>%
mutate(topic = recode(topic,
me = "Palestinian conflict",
nu = "Nuclear weapons and nuclear material",
di = "Arms control and disarmament",
hr = "Human rights",
co = "Colonialism",
ec = "Economic development"
)
)
# Print votes_tidied
votes_tidied
## # A tibble: 347,890 × 10
## rcid session vote ccode year country date
## <dbl> <dbl> <dbl> <int> <dbl> <chr> <date>
## 1 51 2 2 92 1947 El Salvador 1947-10-02
## 2 51 2 3 2 1947 United States of America 1947-10-02
## 3 51 2 2 713 1947 Taiwan, Province of China 1947-10-02
## 4 51 2 1 365 1947 Russian Federation 1947-10-02
## 5 51 2 2 840 1947 Philippines 1947-10-02
## 6 51 2 1 315 1947 Czechoslovakia 1947-10-02
## 7 51 2 1 450 1947 Liberia 1947-10-02
## 8 51 2 2 530 1947 Ethiopia 1947-10-02
## 9 51 2 3 560 1947 South Africa 1947-10-02
## 10 51 2 3 211 1947 Belgium 1947-10-02
## # ... with 347,880 more rows, and 3 more variables: unres <chr>,
## # topic <chr>, has_topic <dbl>
# Summarize the percentage "yes" per country-year-topic
# (ungroup() drops the residual grouping so later nest/model steps start clean)
by_country_year_topic <- votes_tidied %>%
group_by(country, year, topic) %>%
summarize(total=n(), percent_yes=mean(vote == 1)) %>%
ungroup()
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,808 × 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,798 more rows, and 1 more variables: percent_yes <dbl>
# Filter by_country_year_topic for just the US
US_by_country_year_topic <- by_country_year_topic %>%
filter(country == "United States of America")
# Plot % yes over time for the US, faceting by topic
ggplot(US_by_country_year_topic, aes(x=year, y=percent_yes)) +
geom_line() +
facet_wrap(~ topic)
# Print by_country_year_topic
by_country_year_topic
## # A tibble: 26,808 × 5
## country year topic total
## <chr> <dbl> <chr> <int>
## 1 Afghanistan 1947 Colonialism 8
## 2 Afghanistan 1947 Economic development 1
## 3 Afghanistan 1947 Human rights 1
## 4 Afghanistan 1947 Palestinian conflict 6
## 5 Afghanistan 1949 Arms control and disarmament 3
## 6 Afghanistan 1949 Colonialism 22
## 7 Afghanistan 1949 Economic development 1
## 8 Afghanistan 1949 Human rights 3
## 9 Afghanistan 1949 Nuclear weapons and nuclear material 3
## 10 Afghanistan 1949 Palestinian conflict 11
## # ... with 26,798 more rows, and 1 more variables: percent_yes <dbl>
# Fit model on the by_country_year_topic dataset: nest by country AND topic,
# fit percent_yes ~ year per country-topic pair, tidy, and unnest.
# The "essentially perfect fit" warnings below come from country-topic pairs
# with only a couple of years of data, where the line fits exactly.
country_topic_coefficients <- by_country_year_topic %>%
tidyr::nest(-country, -topic) %>%
mutate(model = purrr::map(data, ~ lm(percent_yes ~ year, data = .)),
tidied = purrr::map(model, broom::tidy)) %>%
tidyr::unnest(tidied)
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(x): essentially perfect fit: summary may be
## unreliable
# Print country_topic_coefficients
country_topic_coefficients
## # A tibble: 2,371 × 7
## country topic term estimate
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan Colonialism (Intercept) -13.759624843
## 2 Afghanistan Colonialism year 0.007369733
## 3 Afghanistan Economic development (Intercept) -9.196506325
## 4 Afghanistan Economic development year 0.005106200
## 5 Afghanistan Human rights (Intercept) -11.476390441
## 6 Afghanistan Human rights year 0.006239157
## 7 Afghanistan Palestinian conflict (Intercept) -7.265379964
## 8 Afghanistan Palestinian conflict year 0.004075877
## 9 Afghanistan Arms control and disarmament (Intercept) -13.304119332
## 10 Afghanistan Arms control and disarmament year 0.007145966
## # ... with 2,361 more rows, and 3 more variables: std.error <dbl>,
## # statistic <dbl>, p.value <dbl>
# Create country_topic_filtered: significant year slopes after
# multiple-testing correction (p.adjust() defaults to Holm)
country_topic_filtered <- country_topic_coefficients %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value)) %>%
filter(p.adjusted < 0.05)
# Country-topic trends decreasing most quickly (most negative slope first)
country_topic_filtered %>%
arrange(estimate)
## # A tibble: 56 × 8
## country topic term
## <chr> <chr> <chr>
## 1 Vanuatu Palestinian conflict year
## 2 Vanuatu Colonialism year
## 3 Malta Economic development year
## 4 Cyprus Human rights year
## 5 United States of America Nuclear weapons and nuclear material year
## 6 Cyprus Nuclear weapons and nuclear material year
## 7 Israel Colonialism year
## 8 Romania Human rights year
## 9 Malta Arms control and disarmament year
## 10 Cyprus Arms control and disarmament year
## # ... with 46 more rows, and 5 more variables: estimate <dbl>,
## # std.error <dbl>, statistic <dbl>, p.value <dbl>, p.adjusted <dbl>
# Country-topic trends increasing most quickly (largest positive slope first)
country_topic_filtered %>%
arrange(desc(estimate))
## # A tibble: 56 × 8
## country topic term estimate std.error
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Malawi Palestinian conflict year 0.02008349 0.002890454
## 2 Nepal Palestinian conflict year 0.01868055 0.002207085
## 3 Barbados Palestinian conflict year 0.01658844 0.002954811
## 4 South Africa Economic development year 0.01657445 0.001879572
## 5 Malawi Colonialism year 0.01497103 0.002802337
## 6 Mongolia Economic development year 0.01394112 0.002780405
## 7 Myanmar Palestinian conflict year 0.01345118 0.002525473
## 8 South Africa Colonialism year 0.01299728 0.001447861
## 9 Portugal Colonialism year 0.01228600 0.002440573
## 10 Cuba Arms control and disarmament year 0.01129025 0.002260246
## # ... with 46 more rows, and 3 more variables: statistic <dbl>,
## # p.value <dbl>, p.adjusted <dbl>
Chapter 1 - Flight Data
Review xts fundamentals - time series data, consisting of one or more units over many periods:
Manipulating and visualizing data:
Saving and exporting time series data in R:
Example code includes:
# Create the flights dataset
flightsTotalFlights <- "8912 ; 8418 ; 9637 ; 9363 ; 9360 ; 9502 ; 9992 ; 10173 ; 9417 ; 9762 ; 9558 ; 9429 ; 9000 ; 8355 ; 9501 ; 9351 ; 9542 ; 9552 ; 9896 ; 9909 ; 8845 ; 9100 ; 8496 ; 8146 ; 8228 ; 8016 ; 8869 ; 8793 ; 8987 ; 8751 ; 8960 ; 9140 ; 8293 ; 8809 ; 8345 ; 8024 ; 8168 ; 7714 ; 9195 ; 9318 ; 9580 ; 9750 ; 10291 ; 10392 ; 9290 ; 9702 ; 9075 ; 8890 ; 8283 ; 7755 ; 9322 ; 9374 ; 9534 ; 9662 ; 10098 ; 9932 ; 9105 ; 9673 ; 9020 ; 8872 ; 8841 ; 8383 ; 9980 ; 10005 ; 10243 ; 10544 ; 10837 ; 10728 ; 9724 ; 10161 ; 9463 ; 9103"
flightsDelayFlights <-"1989 ; 1918 ; 2720 ; 1312 ; 1569 ; 1955 ; 2256 ; 2108 ; 1708 ; 1897 ; 1785 ; 2483 ; 1965 ; 1511 ; 2139 ; 2568 ; 3391 ; 2649 ; 2336 ; 2653 ; 2079 ; 1827 ; 1151 ; 889 ; 1254 ; 857 ; 1606 ; 1142 ; 1686 ; 1970 ; 2121 ; 1923 ; 1490 ; 1358 ; 1240 ; 1470 ; 1134 ; 1413 ; 2089 ; 1809 ; 2009 ; 2748 ; 3045 ; 2278 ; 1434 ; 1148 ; 1044 ; 2249 ; 1825 ; 1571 ; 1597 ; 1544 ; 1899 ; 2279 ; 2652 ; 1984 ; 1288 ; 2163 ; 1602 ; 1912 ; 1970 ; 2739 ; 2232 ; 1895 ; 1878 ; 2488 ; 2356 ; 2399 ; 1622 ; 1471 ; 1370 ; 1826"
flightsCancelFlights <- "279 ; 785 ; 242 ; 58 ; 102 ; 157 ; 222 ; 138 ; 144 ; 131 ; 99 ; 678 ; 904 ; 654 ; 153 ; 207 ; 198 ; 226 ; 208 ; 698 ; 135 ; 99 ; 79 ; 72 ; 107 ; 62 ; 72 ; 39 ; 54 ; 118 ; 89 ; 98 ; 69 ; 624 ; 90 ; 101 ; 81 ; 479 ; 218 ; 92 ; 58 ; 118 ; 150 ; 55 ; 73 ; 31 ; 55 ; 223 ; 707 ; 593 ; 191 ; 65 ; 141 ; 141 ; 181 ; 65 ; 69 ; 82 ; 51 ; 44 ; 658 ; 1123 ; 238 ; 68 ; 79 ; 138 ; 85 ; 97 ; 45 ; 57 ; 50 ; 77"
flightsDivertFlights <- "9 ; 23 ; 32 ; 7 ; 8 ; 5 ; 10 ; 20 ; 6 ; 9 ; 2 ; 6 ; 11 ; 7 ; 16 ; 10 ; 13 ; 15 ; 8 ; 17 ; 8 ; 1 ; 5 ; 2 ; 12 ; 5 ; 4 ; 1 ; 4 ; 12 ; 10 ; 6 ; 6 ; 7 ; 2 ; 10 ; 13 ; 20 ; 12 ; 6 ; 9 ; 17 ; 20 ; 9 ; 9 ; 6 ; 9 ; 18 ; 36 ; 13 ; 3 ; 5 ; 7 ; 6 ; 13 ; 7 ; 9 ; 9 ; 3 ; 10 ; 10 ; 20 ; 28 ; 10 ; 17 ; 7 ; 4 ; 23 ; 6 ; 10 ; 6 ; 10"
flightsDate <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
flights <- data.frame(total_flights=as.numeric(strsplit(flightsTotalFlights, " ; ")[[1]]),
delay_flights=as.numeric(strsplit(flightsDelayFlights, " ; ")[[1]]),
cancel_flights=as.numeric(strsplit(flightsCancelFlights, " ; ")[[1]]),
divert_flights=as.numeric(strsplit(flightsDivertFlights, " ; ")[[1]]),
date=as.character(strsplit(flightsDate, " ; ")[[1]]),
stringsAsFactors=FALSE
)
# Sanity-check the assembled data frame before converting it to xts.
#View the structure of the flights data
str(flights)
## 'data.frame': 72 obs. of 5 variables:
## $ total_flights : num 8912 8418 9637 9363 9360 ...
## $ delay_flights : num 1989 1918 2720 1312 1569 ...
## $ cancel_flights: num 279 785 242 58 102 157 222 138 144 131 ...
## $ divert_flights: num 9 23 32 7 8 5 10 20 6 9 ...
## $ date : chr "2010-01-01" "2010-02-01" "2010-03-01" "2010-04-01" ...
#Examine the first five rows of the flights data
head(flights, n = 5)
## total_flights delay_flights cancel_flights divert_flights date
## 1 8912 1989 279 9 2010-01-01
## 2 8418 1918 785 23 2010-02-01
## 3 9637 2720 242 32 2010-03-01
## 4 9363 1312 58 7 2010-04-01
## 5 9360 1569 102 8 2010-05-01
#Identify class of the column containing date information
# (still plain character at this point; converted to Date below)
class(flights$date)
## [1] "character"
# Load the xts package (also attaches its zoo dependency)
library(xts)
## Warning: package 'xts' was built under R version 3.2.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:data.table':
##
## last
## The following objects are masked from 'package:dplyr':
##
## first, last
# Convert date column to a time-based class (Date) so it can serve as an index
flights$date <- as.Date(flights$date)
# Convert flights to an xts object using as.xts; drop column 5 (date) from the
# data and supply it as the time index instead
flights_xts <- as.xts(flights[ , -5], order.by = flights$date)
# Check the class of flights_xts
class(flights_xts)
## [1] "xts" "zoo"
# Examine the first five lines of flights_xts
head(flights_xts, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
# Identify the periodicity of flights_xts
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Identify the number of periods in flights_xts
nmonths(flights_xts)
## [1] 72
# Find data on flights arriving in BOS in June 2014
# (xts supports subsetting by an ISO date string)
flights_xts["2014-06-01"]
## total_flights delay_flights cancel_flights divert_flights
## 2014-06-01 9662 2279 141 6
# Use plot.xts() to view total monthly flights into BOS over time
plot.xts(flights_xts$total_flights)
# Use plot.xts() to view monthly delayed flights into BOS over time
plot.xts(flights_xts$delay_flights)
# Use plot.zoo() to view all four columns of data in their own panels
labels <- c("Total", "Delay", "Cancel", "Divert")
plot.zoo(flights_xts, plot.type = "multiple", ylab = labels)
# Use plot.zoo() to view all four columns of data in one panel,
# distinguishing the series by line type
lty <- 1:4
plot.zoo(flights_xts, plot.type = "single", lty = lty)
legend("right", lty = lty, legend = labels)
# Derive monthly disruption rates, each expressed as a percentage of
# total flights for that month.
# pct_delay: share of flights delayed
flights_xts$pct_delay <- 100 * (flights_xts$delay_flights / flights_xts$total_flights)
# Inspect the delay rate over the full sample
plot.xts(flights_xts$pct_delay)
# pct_cancel: share of flights cancelled
flights_xts$pct_cancel <- 100 * (flights_xts$cancel_flights / flights_xts$total_flights)
# pct_divert: share of flights diverted
flights_xts$pct_divert <- 100 * (flights_xts$divert_flights / flights_xts$total_flights)
# Plot the three disruption rates together over time
plot.zoo(x = flights_xts[, c("pct_delay", "pct_cancel", "pct_divert")])
# Persist and reload the xts object two ways: RDS (preserves the xts class
# exactly) and CSV via write.zoo/read.zoo (round-trips through plain text).
# Save your xts object to rds file using saveRDS
saveRDS(object = flights_xts, file = "flights_xts.rds")
# Read your flights_xts data from the rds file
flights_xts2 <- readRDS("flights_xts.rds")
# Check the class of your new flights_xts2 object
class(flights_xts2)
## [1] "xts" "zoo"
# Examine the first five rows of your new flights_xts2 object
head(flights_xts2, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## pct_delay pct_cancel pct_divert
## 2010-01-01 22.31822 3.1306104 0.10098743
## 2010-02-01 22.78451 9.3252554 0.27322404
## 2010-03-01 28.22455 2.5111549 0.33205354
## 2010-04-01 14.01260 0.6194596 0.07476236
## 2010-05-01 16.76282 1.0897436 0.08547009
# Export your xts object to a csv file using write.zoo
write.zoo(flights_xts, file = "flights_xts.csv", sep = ",")
# Open your saved object using read.zoo; column 1 holds the index, parsed
# back into Date via FUN
flights2 <- read.zoo("flights_xts.csv", sep = ",", FUN = as.Date, header = TRUE, index.column = 1)
# Encode your new object back into xts
flights_xts2 <- as.xts(flights2)
# Examine the first five rows of your new flights_xts2 object
head(flights_xts2, n=5)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## pct_delay pct_cancel pct_divert
## 2010-01-01 22.31822 3.1306104 0.10098743
## 2010-02-01 22.78451 9.3252554 0.27322404
## 2010-03-01 28.22455 2.5111549 0.33205354
## 2010-04-01 14.01260 0.6194596 0.07476236
## 2010-05-01 16.76282 1.0897436 0.08547009
Chapter 2 - Weather Data
Merging using rbind() - since xts objects are already ordered by time, rbind() outputs will also be ordered by time:
Merging time series data by column:
Time series data workflows:
Example code includes:
# Cached to avoid multiple pings to this server
# Pull daily BOS temperature summaries for 2007-2015 and stack them into one
# data frame (columns: min, mean, max in deg F, plus a Date column).
# Building the per-year results with lapply() and binding once with
# do.call(rbind, ...) avoids the quadratic cost of growing a data frame
# with rbind() inside the loop; download order and output are unchanged.
yearlyWeather <- lapply(2007:2015, function(getYear) {
  testWeather <- weatherData::getWeatherForYear(station_id="BOS", year=getYear)
  # mutate does not accept input variable "Date" as a POSIXlt; convert it outside dplyr
  testWeather$date <- as.Date(testWeather$Date)
  testWeather %>%
    select(-Date) %>%
    mutate(min=Min_TemperatureF, mean=Mean_TemperatureF, max=Max_TemperatureF) %>%
    select(min, mean, max, date)
})
allWeather <- do.call(rbind, yearlyWeather)
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2007/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2007&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2007-01-01 to 2007-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2008/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2008&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 366 records for 2008-01-01 to 2008-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2009/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2009&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2009-01-01 to 2009-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2010/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2010&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2010-01-01 to 2010-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2011/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2011&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2011-01-01 to 2011-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2012/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2012&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 366 records for 2012-01-01 to 2012-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2013/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2013&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2013-01-01 to 2013-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2014/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2014&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2014-01-01 to 2014-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
## Retrieving from: http://www.wunderground.com/history/airport/BOS/2015/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2015&req_city=NA&req_state=NA&req_statename=NA&format=1
## The following columns are available:
## [1] "CET" "Max_TemperatureF"
## [3] "Mean_TemperatureF" "Min_TemperatureF"
## [5] "Max_Dew_PointF" "MeanDew_PointF"
## [7] "Min_DewpointF" "Max_Humidity"
## [9] "Mean_Humidity" "Min_Humidity"
## [11] "Max_Sea_Level_PressureIn" "Mean_Sea_Level_PressureIn"
## [13] "Min_Sea_Level_PressureIn" "Max_VisibilityMiles"
## [15] "Mean_VisibilityMiles" "Min_VisibilityMiles"
## [17] "Max_Wind_SpeedMPH" "Mean_Wind_SpeedMPH"
## [19] "Max_Gust_SpeedMPH" "PrecipitationIn"
## [21] "CloudCover" "Events"
## [23] "WindDirDegrees"
## Checking Summarized Data Availability For BOS
## Found 365 records for 2015-01-01 to 2015-12-31
## Data is Available for the interval.
## Will be fetching these Columns:
## [1] "Date" "Max_TemperatureF" "Mean_TemperatureF"
## [4] "Min_TemperatureF"
str(allWeather)
## 'data.frame': 3287 obs. of 4 variables:
## $ min : int 21 41 30 24 23 28 24 28 30 28 ...
## $ mean: int 34 46 36 32 28 36 34 37 38 34 ...
## $ max : int 46 50 41 41 33 44 44 46 46 41 ...
## $ date: Date, format: "2007-01-01" "2007-01-02" ...
# Continuing, no need for cached data
# Split the daily weather into two date ranges (2007-2012 and 2013-2015)
# so they can be re-joined with rbind() below.
temps_1 <- allWeather %>%
filter(date <= "2012-12-31")
temps_2 <- allWeather %>%
filter(date > "2012-12-31")
# View the structure of each object
str(temps_1)
## 'data.frame': 2192 obs. of 4 variables:
## $ min : int 21 41 30 24 23 28 24 28 30 28 ...
## $ mean: int 34 46 36 32 28 36 34 37 38 34 ...
## $ max : int 46 50 41 41 33 44 44 46 46 41 ...
## $ date: Date, format: "2007-01-01" "2007-01-02" ...
str(temps_2)
## 'data.frame': 1095 obs. of 4 variables:
## $ min : int 28 33 35 26 32 32 28 21 17 21 ...
## $ mean: int 33 40 42 36 39 36 35 27 28 30 ...
## $ max : int 38 46 50 46 46 41 42 33 39 39 ...
## $ date: Date, format: "2013-01-01" "2013-01-02" ...
# View the first and last rows of temps_1
head(temps_1)
## min mean max date
## 1 21 34 46 2007-01-01
## 2 41 46 50 2007-01-02
## 3 30 36 41 2007-01-03
## 4 24 32 41 2007-01-04
## 5 23 28 33 2007-01-05
## 6 28 36 44 2007-01-06
tail(temps_1)
## min mean max date
## 2187 32 36 40 2012-12-26
## 2188 35 40 46 2012-12-27
## 2189 35 40 46 2012-12-28
## 2190 39 40 42 2012-12-29
## 2191 33 40 48 2012-12-30
## 2192 28 36 44 2012-12-31
# View the first and last rows of temps_2
head(temps_2)
## min mean max date
## 1 28 33 38 2013-01-01
## 2 33 40 46 2013-01-02
## 3 35 42 50 2013-01-03
## 4 26 36 46 2013-01-04
## 5 32 39 46 2013-01-05
## 6 32 36 41 2013-01-06
tail(temps_2)
## min mean max date
## 1090 30 40 50 2015-12-26
## 1091 26 38 51 2015-12-27
## 1092 28 40 52 2015-12-28
## 1093 28 39 50 2015-12-29
## 1094 29 38 45 2015-12-30
## 1095 19 28 34 2015-12-31
# Confirm that the date column in each object is a time-based class
class(temps_1$date)
## [1] "Date"
class(temps_2$date)
## [1] "Date"
# Encode your two temperature data frames as xts objects; column 4 (date)
# is dropped from the data and used as the time index
temps_1_xts <- as.xts(temps_1[, -4], order.by = temps_1$date)
temps_2_xts <- as.xts(temps_2[, -4], order.by = temps_2$date)
# View the first few lines of each new xts object to confirm they are properly formatted
head(temps_1_xts)
## min mean max
## 2007-01-01 21 34 46
## 2007-01-02 41 46 50
## 2007-01-03 30 36 41
## 2007-01-04 24 32 41
## 2007-01-05 23 28 33
## 2007-01-06 28 36 44
head(temps_2_xts)
## min mean max
## 2013-01-01 28 33 38
## 2013-01-02 33 40 46
## 2013-01-03 35 42 50
## 2013-01-04 26 36 46
## 2013-01-05 32 39 46
## 2013-01-06 32 36 41
# Use rbind to merge your new xts objects; xts keeps the result ordered by time
temps_xts <- rbind(temps_1_xts, temps_2_xts)
# View data for the first 3 days of the last month of the first year in temps_xts
# (nested first()/last() calls narrow the window step by step)
first(last(first(temps_xts, "1 year"), "1 month"), "3 days")
## min mean max
## 2007-12-01 32 41 50
## 2007-12-02 28 39 50
## 2007-12-03 30 40 50
# Identify the periodicity of temps_xts
periodicity(temps_xts)
## Daily periodicity from 2007-01-01 to 2015-12-31
# Generate a plot of mean Boston temperature for the duration of your data
plot.xts(temps_xts$mean)
# Generate a plot of mean Boston temperature from November 2010 through April 2011
plot.xts(temps_xts["2010-11-01/2011-04-30"]$mean)
# Dotted lines for min/max, solid line for mean
lty <- c(3, 1, 3)
plot.zoo(temps_xts["2010-11-01/2011-04-30"], plot.type = "single", lty = lty)
# Subset your temperature data to include only 2010 through 2015: temps_xts_2
temps_xts_2 <- temps_xts["2010/2015"]
# Use to.period to convert temps_xts_2 to monthly periodicity; OHLC = FALSE
# keeps the original columns, indexAt = "firstof" stamps each month's first day
temps_monthly <- to.period(temps_xts_2, period = "months", OHLC = FALSE, indexAt = "firstof")
# Compare the periodicity and duration of temps_monthly and flights_xts
periodicity(temps_monthly)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# First-of-month index for the monthly averages computed below
idxRaw <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
index <- as.Date(strsplit(idxRaw, " ; ")[[1]])
# Split temps_xts_2 mean temperatures into one chunk per calendar month
monthly_split <- split(temps_xts_2$mean , f = "months")
# vapply is type-stable: it guarantees exactly one numeric per month,
# unlike lapply() followed by as.numeric() on a list, which only works by
# accident when every element happens to be a length-1 numeric
mean_of_means <- vapply(monthly_split, FUN = mean, FUN.VALUE = numeric(1))
# Use as.xts to generate an xts object of average monthly temperature data
# (as.numeric strips the names vapply attaches, matching the original output)
temps_monthly <- as.xts(as.numeric(mean_of_means), order.by = index)
# Compare the periodicity and duration of your new temps_monthly and flights_xts
periodicity(temps_monthly)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(flights_xts)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Use merge to combine your flights and temperature objects
# (merge on xts aligns the two series column-wise on their shared index)
flights_temps <- merge(flights_xts, temps_monthly)
# Examine the first few rows of your combined xts object
head(flights_temps)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## 2010-06-01 9502 1955 157 5
## pct_delay pct_cancel pct_divert temps_monthly
## 2010-01-01 22.31822 3.1306104 0.10098743 36.12903
## 2010-02-01 22.78451 9.3252554 0.27322404 37.71429
## 2010-03-01 28.22455 2.5111549 0.33205354 42.22581
## 2010-04-01 14.01260 0.6194596 0.07476236 51.26667
## 2010-05-01 16.76282 1.0897436 0.08547009 56.87097
## 2010-06-01 20.57462 1.6522837 0.05262050 63.56667
# Use plot.zoo to plot these two columns in a single panel
lty <- c(1, 2)
plot.zoo(flights_temps[,c("pct_delay", "temps_monthly")], plot.type = "single", lty = lty)
labels <- c("Pct. Delay", "Temperature")
legend("topright", lty = lty, legend = labels, bg = "white")
# Monthly mean wind speed at BOS, " ; "-delimited, aligned with idxData below
windData <- "7.19 ; 5.21 ; 4.9 ; 4.7 ; 4.13 ; 4.3 ; 4.74 ; 4.94 ; 4.57 ; 4.48 ; 5.97 ; 5.87 ; 4.58 ; 6 ; 5.58 ; 5.23 ; 4.71 ; 4.5 ; 3.94 ; 4.65 ; 4.73 ; 5.39 ; 4.2 ; 5.65 ; 5.55 ; 6.03 ; 5.29 ; 5.6 ; 4.03 ; 4.1 ; 4.71 ; 4.55 ; 4.33 ; 4.77 ; 4.63 ; 5.48 ; 5.68 ; 4.82 ; 6 ; 4.93 ; 5.19 ; 4.8 ; 5.19 ; 4.74 ; 4.7 ; 3.52 ; 4.87 ; 4.45 ; 3.87 ; 3.71 ; 5.16 ; 4.2 ; 4.06 ; 4.2 ; 4.32 ; 4.19 ; 4.27 ; 4.65 ; 3.67 ; 4.13 ; 4.77 ; 4.79 ; 5.26 ; 5 ; 4.52 ; 4.47 ; 4.52 ; 4.26 ; 5.03 ; 4.29 ; 4.07 ; 3.84"
# Monthly mean visibility at BOS over the same months
visData <- "5.77 ; 5.86 ; 5.81 ; 6 ; 6 ; 6 ; 6 ; 6 ; 5.93 ; 6 ; 5.83 ; 5.97 ; 5.61 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 5.97 ; 6 ; 5.7 ; 5.61 ; 5.71 ; 5.66 ; 5.9 ; 6.37 ; 6.39 ; 7.5 ; 7.29 ; 7.77 ; 7.8 ; 7.65 ; 7.4 ; 6.68 ; 6.81 ; 6.82 ; 7 ; 7.57 ; 6.94 ; 6.83 ; 6.48 ; 6.45 ; 7.6 ; 9.03 ; 8.2 ; 8.97 ; 6.03 ; 8.57 ; 8.58 ; 7.77 ; 7.74 ; 7.77 ; 8.03 ; 8.55 ; 7.77 ; 8.23 ; 8.2 ; 8.23 ; 8.55 ; 8.79 ; 7.9 ; 8.6 ; 8.26 ; 7.67 ; 8.06 ; 7.87 ; 8.1 ; 7.81 ; 9.33 ; 8.77"
# First-of-month dates shared by both series
idxData <- "2010-01-01 ; 2010-02-01 ; 2010-03-01 ; 2010-04-01 ; 2010-05-01 ; 2010-06-01 ; 2010-07-01 ; 2010-08-01 ; 2010-09-01 ; 2010-10-01 ; 2010-11-01 ; 2010-12-01 ; 2011-01-01 ; 2011-02-01 ; 2011-03-01 ; 2011-04-01 ; 2011-05-01 ; 2011-06-01 ; 2011-07-01 ; 2011-08-01 ; 2011-09-01 ; 2011-10-01 ; 2011-11-01 ; 2011-12-01 ; 2012-01-01 ; 2012-02-01 ; 2012-03-01 ; 2012-04-01 ; 2012-05-01 ; 2012-06-01 ; 2012-07-01 ; 2012-08-01 ; 2012-09-01 ; 2012-10-01 ; 2012-11-01 ; 2012-12-01 ; 2013-01-01 ; 2013-02-01 ; 2013-03-01 ; 2013-04-01 ; 2013-05-01 ; 2013-06-01 ; 2013-07-01 ; 2013-08-01 ; 2013-09-01 ; 2013-10-01 ; 2013-11-01 ; 2013-12-01 ; 2014-01-01 ; 2014-02-01 ; 2014-03-01 ; 2014-04-01 ; 2014-05-01 ; 2014-06-01 ; 2014-07-01 ; 2014-08-01 ; 2014-09-01 ; 2014-10-01 ; 2014-11-01 ; 2014-12-01 ; 2015-01-01 ; 2015-02-01 ; 2015-03-01 ; 2015-04-01 ; 2015-05-01 ; 2015-06-01 ; 2015-07-01 ; 2015-08-01 ; 2015-09-01 ; 2015-10-01 ; 2015-11-01 ; 2015-12-01"
# Parse the shared date index once instead of re-splitting the identical
# string for each series
idxDates <- as.Date(strsplit(idxData, " ; ")[[1]])
# Build one xts series per weather variable on the common monthly index
wind <- as.xts(as.numeric(strsplit(windData, " ; ")[[1]]), order.by = idxDates)
vis <- as.xts(as.numeric(strsplit(visData, " ; ")[[1]]), order.by = idxDates)
# Confirm the periodicity and duration of the vis and wind data
periodicity(vis)
## Monthly periodicity from 2010-01-01 to 2015-12-01
periodicity(wind)
## Monthly periodicity from 2010-01-01 to 2015-12-01
# Merge vis and wind with your existing flights_temps data
# (three-way merge aligns all series on the shared monthly index)
flights_weather <- merge(flights_temps, vis, wind)
# View the first few rows of your flights_weather data
head(flights_weather)
## total_flights delay_flights cancel_flights divert_flights
## 2010-01-01 8912 1989 279 9
## 2010-02-01 8418 1918 785 23
## 2010-03-01 9637 2720 242 32
## 2010-04-01 9363 1312 58 7
## 2010-05-01 9360 1569 102 8
## 2010-06-01 9502 1955 157 5
## pct_delay pct_cancel pct_divert temps_monthly vis wind
## 2010-01-01 22.31822 3.1306104 0.10098743 36.12903 5.77 7.19
## 2010-02-01 22.78451 9.3252554 0.27322404 37.71429 5.86 5.21
## 2010-03-01 28.22455 2.5111549 0.33205354 42.22581 5.81 4.90
## 2010-04-01 14.01260 0.6194596 0.07476236 51.26667 6.00 4.70
## 2010-05-01 16.76282 1.0897436 0.08547009 56.87097 6.00 4.13
## 2010-06-01 20.57462 1.6522837 0.05262050 63.56667 6.00 4.30
Chapter 3 - Economic Data
Handling missingness - missing values confound identification of trends and/or statistical tests:
Lagging and differencing - computing leads, lags, and period-over-period differences in the data:
Rolling functions:
Example code includes:
# Quarterly date labels, 1947 Q1 - 2016 Q3, " ; "-delimited; the string
# literal intentionally spans two source lines (embedded newline in the data)
gdpDate <- "1947 Q1 ; 1947 Q2 ; 1947 Q3 ; 1947 Q4 ; 1948 Q1 ; 1948 Q2 ; 1948 Q3 ; 1948 Q4 ; 1949 Q1 ; 1949 Q2 ; 1949 Q3 ; 1949 Q4 ; 1950 Q1 ; 1950 Q2 ; 1950 Q3 ; 1950 Q4 ; 1951 Q1 ; 1951 Q2 ; 1951 Q3 ; 1951 Q4 ; 1952 Q1 ; 1952 Q2 ; 1952 Q3 ; 1952 Q4 ; 1953 Q1 ; 1953 Q2 ; 1953 Q3 ; 1953 Q4 ; 1954 Q1 ; 1954 Q2 ; 1954 Q3 ; 1954 Q4 ; 1955 Q1 ; 1955 Q2 ; 1955 Q3 ; 1955 Q4 ; 1956 Q1 ; 1956 Q2 ; 1956 Q3 ; 1956 Q4 ; 1957 Q1 ; 1957 Q2 ; 1957 Q3 ; 1957 Q4 ; 1958 Q1 ; 1958 Q2 ; 1958 Q3 ; 1958 Q4 ; 1959 Q1 ; 1959 Q2 ; 1959 Q3 ; 1959 Q4 ; 1960 Q1 ; 1960 Q2 ; 1960 Q3 ; 1960 Q4 ; 1961 Q1 ; 1961 Q2 ; 1961 Q3 ; 1961 Q4 ; 1962 Q1 ; 1962 Q2 ; 1962 Q3 ; 1962 Q4 ; 1963 Q1 ; 1963 Q2 ; 1963 Q3 ; 1963 Q4 ; 1964 Q1 ; 1964 Q2 ; 1964 Q3 ; 1964 Q4 ; 1965 Q1 ; 1965 Q2 ; 1965 Q3 ; 1965 Q4 ; 1966 Q1 ; 1966 Q2 ; 1966 Q3 ; 1966 Q4 ; 1967 Q1 ; 1967 Q2 ; 1967 Q3 ; 1967 Q4 ; 1968 Q1 ; 1968 Q2 ; 1968 Q3 ; 1968 Q4 ; 1969 Q1 ; 1969 Q2 ; 1969 Q3 ; 1969 Q4 ; 1970 Q1 ; 1970 Q2 ; 1970 Q3 ; 1970 Q4 ; 1971 Q1 ; 1971 Q2 ; 1971 Q3 ; 1971 Q4 ; 1972 Q1 ; 1972 Q2 ; 1972 Q3 ; 1972 Q4 ; 1973 Q1 ; 1973 Q2 ; 1973 Q3 ; 1973 Q4 ; 1974 Q1 ; 1974 Q2 ; 1974 Q3 ; 1974 Q4 ; 1975 Q1 ; 1975 Q2 ; 1975 Q3 ; 1975 Q4 ; 1976 Q1 ; 1976 Q2 ; 1976 Q3 ; 1976 Q4 ; 1977 Q1 ; 1977 Q2 ; 1977 Q3 ; 1977 Q4 ; 1978 Q1 ; 1978 Q2 ; 1978 Q3 ; 1978 Q4 ; 1979 Q1 ; 1979 Q2 ; 1979 Q3 ; 1979 Q4 ; 1980 Q1 ; 1980 Q2 ; 1980 Q3 ; 1980 Q4 ; 1981 Q1 ; 1981 Q2 ; 1981 Q3 ; 1981 Q4 ; 1982 Q1 ; 1982 Q2 ; 1982 Q3 ; 1982 Q4 ; 1983 Q1 ; 1983 Q2 ; 1983 Q3 ; 1983 Q4 ; 1984 Q1 ; 1984 Q2 ; 1984 Q3 ; 1984 Q4 ; 1985 Q1 ; 1985 Q2 ; 1985 Q3 ; 1985 Q4 ; 1986 Q1 ; 1986 Q2 ; 1986 Q3 ; 1986 Q4 ; 1987 Q1 ; 1987 Q2 ; 1987 Q3 ; 1987 Q4 ; 1988 Q1 ; 1988 Q2 ; 1988 Q3 ; 1988 Q4 ; 1989 Q1 ; 1989 Q2 ; 1989 Q3 ; 1989 Q4 ; 1990 Q1 ; 1990 Q2 ; 1990 Q3 ; 1990 Q4 ; 1991 Q1 ; 1991 Q2 ; 1991 Q3 ; 1991 Q4 ; 1992 Q1 ; 1992 Q2 ; 1992 Q3 ; 1992 Q4 ; 1993 Q1 ; 1993 Q2 ; 1993 Q3 ; 1993 Q4 ; 1994 Q1 ; 1994 Q2 ; 1994 Q3 ; 1994 Q4 ; 1995 Q1 ; 1995 Q2 ; 1995 Q3 ; 1995 Q4 ; 1996 Q1 ; 1996 Q2 ; 1996 Q3 
; 1996 Q4 ; 1997 Q1 ; 1997 Q2 ; 1997 Q3 ; 1997 Q4 ; 1998 Q1 ; 1998 Q2 ; 1998 Q3 ; 1998 Q4 ; 1999 Q1 ; 1999 Q2 ; 1999 Q3 ; 1999 Q4 ; 2000 Q1 ; 2000 Q2 ; 2000 Q3 ; 2000 Q4 ; 2001 Q1 ; 2001 Q2 ; 2001 Q3 ; 2001 Q4 ; 2002 Q1 ; 2002 Q2 ; 2002 Q3 ; 2002 Q4 ; 2003 Q1 ; 2003 Q2 ; 2003 Q3 ; 2003 Q4 ; 2004 Q1 ; 2004 Q2 ; 2004 Q3 ; 2004 Q4 ; 2005 Q1 ; 2005 Q2 ; 2005 Q3 ; 2005 Q4 ; 2006 Q1 ; 2006 Q2 ; 2006 Q3 ; 2006 Q4 ; 2007 Q1 ; 2007 Q2 ; 2007 Q3 ; 2007 Q4 ; 2008 Q1 ; 2008 Q2 ; 2008 Q3 ; 2008 Q4 ; 2009 Q1 ; 2009 Q2 ; 2009 Q3 ; 2009 Q4 ; 2010 Q1 ; 2010 Q2 ; 2010 Q3 ; 2010 Q4 ; 2011 Q1 ; 2011 Q2 ; 2011 Q3 ; 2011 Q4 ; 2012 Q1 ; 2012 Q2 ; 2012 Q3 ; 2012 Q4 ; 2013 Q1 ; 2013 Q2 ; 2013 Q3 ; 2013 Q4 ; 2014 Q1 ; 2014 Q2 ; 2014 Q3 ; 2014 Q4 ; 2015 Q1 ; 2015 Q2 ; 2015 Q3 ; 2015 Q4 ; 2016 Q1 ; 2016 Q2 ; 2016 Q3"
# Quarterly nominal GDP in billions; "NA" tokens mark deliberately missing
# quarters used below to demonstrate na.locf / na.approx
gdpGDP <- "243.1 ; 246.3 ; 250.1 ; 260.3 ; 266.2 ; 272.9 ; 279.5 ; 280.7 ; 275.4 ; NA ; NA ; 271 ; 281.2 ; NA ; 308.5 ; 320.3 ; 336.4 ; NA ; 351.8 ; 356.6 ; NA ; NA ; NA ; 381.2 ; 388.5 ; NA ; NA ; NA ; NA ; NA ; 391.6 ; 400.3 ; 413.8 ; 422.2 ; 430.9 ; NA ; NA ; 446.8 ; 452 ; 461.3 ; 470.6 ; 472.8 ; NA ; NA ; NA ; NA ; 486.7 ; 500.4 ; 511.1 ; 524.2 ; 525.2 ; 529.3 ; 543.3 ; 542.7 ; 546 ; 541.1 ; 545.9 ; 557.4 ; 568.2 ; 581.6 ; 595.2 ; 602.6 ; 609.6 ; NA ; NA ; NA ; NA ; 654.8 ; 671.1 ; 680.8 ; 692.8 ; 698.4 ; 719.2 ; 732.4 ; NA ; NA ; NA ; NA ; 820.8 ; 834.9 ; 846 ; 851.1 ; 866.6 ; 883.2 ; NA ; 936.3 ; 952.3 ; NA ; 995.4 ; 1011.4 ; 1032 ; 1040.7 ; 1053.5 ; 1070.1 ; NA ; 1091.5 ; 1137.8 ; 1159.4 ; 1180.3 ; 1193.6 ; 1233.8 ; NA ; NA ; 1332 ; 1380.7 ; 1417.6 ; 1436.8 ; 1479.1 ; 1494.7 ; 1534.2 ; NA ; 1603 ; NA ; NA ; 1713.8 ; 1765.9 ; 1824.5 ; 1856.9 ; 1890.5 ; 1938.4 ; 1992.5 ; 2060.2 ; 2122.4 ; NA ; NA ; 2336.6 ; 2398.9 ; 2482.2 ; 2531.6 ; NA ; 2670.4 ; 2730.7 ; 2796.5 ; 2799.9 ; 2860 ; NA ; 3131.8 ; 3167.3 ; 3261.2 ; 3283.5 ; 3273.8 ; NA ; NA ; NA ; 3480.3 ; 3583.8 ; 3692.3 ; 3796.1 ; NA ; NA ; NA ; NA ; 4237 ; 4302.3 ; 4394.6 ; 4453.1 ; NA ; NA ; NA ; NA ; 4736.2 ; 4821.5 ; 4900.5 ; 5022.7 ; NA ; NA ; NA ; NA ; NA ; NA ; 5711.6 ; 5763.4 ; 5890.8 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; 7545.3 ; 7604.9 ; 7706.5 ; 7799.5 ; 7893.1 ; 8061.5 ; 8159 ; 8287.1 ; 8402.1 ; 8551.9 ; 8691.8 ; 8788.3 ; 8889.7 ; 8994.7 ; 9146.5 ; 9325.7 ; 9447.1 ; NA ; 9712.3 ; 9926.1 ; 10031 ; 10278.3 ; 10357.4 ; 10472.3 ; 10508.1 ; 10638.4 ; 10639.5 ; 10701.3 ; NA ; NA ; 11037.1 ; 11103.8 ; 11230.1 ; 11370.7 ; 11625.1 ; 11816.8 ; 11988.4 ; 12181.4 ; 12367.7 ; 12562.2 ; 12813.7 ; 12974.1 ; 13205.4 ; 13381.6 ; 13648.9 ; NA ; 13908.5 ; 14066.4 ; 14233.2 ; 14422.3 ; 14569.7 ; 14685.3 ; 14668.4 ; 14813 ; 14843 ; 14549.9 ; 14383.9 ; 14340.4 ; 14384.1 ; 14566.5 ; 14681.1 ; 14888.6 ; 15057.7 ; 15230.2 ; NA ; 15460.9 ; 15587.1 ; 15785.3 ; 15973.9 
; 16121.9 ; 16227.9 ; 16297.3 ; 16475.4 ; 16541.4 ; 16749.3 ; 16999.9 ; 17025.2 ; 17285.6 ; 17569.4 ; 17692.2 ; NA ; 17998.3 ; 18141.9 ; 18222.8 ; 18281.6 ; 18450.1 ; 18651.2"
gdp <- data.frame(date=strsplit(gdpDate, " ; ")[[1]],
gdp_billions=as.numeric(strsplit(gdpGDP, " ; ")[[1]]),
stringsAsFactors=TRUE
) # want the date to be a factor to match input
## Warning in data.frame(date = strsplit(gdpDate, " ; ")[[1]], gdp_billions =
## as.numeric(strsplit(gdpGDP, : NAs introduced by coercion
# Count the missing values produced by the "NA" tokens above
sum(is.na(gdp))
## [1] 80
str(gdp)
## 'data.frame': 279 obs. of 2 variables:
## $ date : Factor w/ 279 levels "1947 Q1","1947 Q2",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gdp_billions: num 243 246 250 260 266 ...
# Get a summary of your GDP data
summary(gdp)
## date gdp_billions
## 1947 Q1: 1 Min. : 243.1
## 1947 Q2: 1 1st Qu.: 708.8
## 1947 Q3: 1 Median : 3167.3
## 1947 Q4: 1 Mean : 6186.6
## 1948 Q1: 1 3rd Qu.:11497.9
## 1948 Q2: 1 Max. :18651.2
## (Other):273 NA's :80
# Convert GDP date column to time object (zoo's yearqtr class handles
# "YYYY Qn" quarterly labels)
gdp$date <- as.yearqtr(gdp$date)
# Convert GDP data to xts, using the quarterly dates as the index
gdp_xts <- as.xts(gdp[, -1], order.by = gdp$date)
names(gdp_xts) <- "gdp"
# Plot GDP data over time
plot.xts(gdp_xts)
# Fill NAs in gdp_xts with the last observation carried forward
gdp_locf <- na.locf(gdp_xts)
# Fill NAs in gdp_xts with the next observation carried backward
gdp_nocb <- na.locf(gdp_xts, fromLast=TRUE)
# Produce a plot for each of your new xts objects (stacked 2x1 panel)
par(mfrow = c(2,1))
plot.xts(gdp_locf, major.format = "%Y")
plot.xts(gdp_nocb, major.format = "%Y")
par(mfrow = c(1,1))
# Query for GDP in 1993 in both gdp_locf and gdp_nocb to compare the two
# fill strategies on a fully-missing year
gdp_locf["1993"]
## gdp
## 1993 Q2 5890.8
## 1993 Q3 5890.8
## 1993 Q4 5890.8
## 1994 Q1 5890.8
gdp_nocb["1993"]
## gdp
## 1993 Q2 7545.3
## 1993 Q3 7545.3
## 1993 Q4 7545.3
## 1994 Q1 7545.3
# Fill NAs in gdp_xts using linear approximation between known observations
gdp_approx <- na.approx(gdp_xts)
# Plot your new xts object
plot.xts(gdp_approx, major.format = "%Y")
# Query for GDP in 1993 in gdp_approx
gdp_approx["1993"]
## gdp
## 1993 Q2 6966.225
## 1993 Q3 7048.950
## 1993 Q4 7131.675
## 1994 Q1 7214.400
# Monthly unemployment rates (with NAs), " ; "-delimited; the string literal
# intentionally continues onto the next source line (embedded newline)
unemCore1 <- "7.9 ; 7.7 ; 7.6 ; 7.7 ; 7.4 ; 7.6 ; NA ; NA ; 7.6 ; 7.7 ; 7.8 ; NA ; 7.5 ; 7.6 ; 7.4 ; 7.2 ; 7 ; 7.2 ; 6.9 ; 7 ; 6.8 ; NA ; NA ; 6.4 ; 6.4 ; 6.3 ; 6.3 ; 6.1 ; 6 ; 5.9 ; 6.2 ; 5.9 ; 6 ; 5.8 ; 5.9 ; 6 ; 5.9 ; 5.9 ; NA ; NA ; NA ; NA ; NA ; 6 ; 5.9 ; 6 ; 5.9 ; 6 ; 6.3 ; NA ; NA ; 6.9 ; 7.5 ; 7.6 ; 7.8 ; 7.7 ; 7.5 ; NA ; NA ; 7.2 ; 7.5 ; 7.4 ; 7.4 ; 7.2 ; NA ; NA ; 7.2 ; 7.4 ; 7.6 ; 7.9 ; 8.3 ; 8.5 ; 8.6 ; 8.9 ; 9 ; 9.3 ; 9.4 ; NA ; NA ; NA ; NA ; NA ; 10.8 ; 10.8 ; 10.4 ; 10.4 ; 10.3 ; 10.2 ; 10.1 ; 10.1 ; 9.4 ; 9.5 ; 9.2 ; NA ; NA ; NA ; 8 ; NA ; 7.8 ; NA ; NA ; 7.2 ; 7.5 ; 7.5 ; 7.3 ; 7.4 ; 7.2 ; 7.3 ; 7.3 ; 7.2 ; 7.2 ; 7.3 ; 7.2 ; 7.4 ; 7.4 ; 7.1 ; 7.1 ; 7.1 ; 7 ; 7 ; 6.7 ; 7.2 ; 7.2 ; 7.1 ; 7.2 ; 7.2 ; 7 ; 6.9 ; 7 ; 7 ; 6.9 ; 6.6 ; 6.6 ; 6.6 ; 6.6 ; 6.3 ; 6.3 ; 6.2 ; 6.1 ; 6 ; 5.9 ; 6 ; 5.8 ; 5.7 ; NA ; NA ; NA ; 5.4 ; 5.6 ; 5.4 ; 5.4 ; 5.6 ; 5.4 ; NA ; NA ; NA ; 5.4 ; 5.2 ; 5 ; 5.2 ; NA ; NA ; NA ; NA ; 5.3 ; 5.3 ; 5.4 ; NA ; 5.4 ; 5.3 ; 5.2 ; 5.4 ; 5.4 ; 5.2 ; 5.5 ; 5.7 ; 5.9 ; 5.9 ; 6.2 ; 6.3 ; 6.4 ; 6.6 ; 6.8 ; 6.7 ; 6.9 ; 6.9 ; 6.8 ; 6.9 ; 6.9 ; 7 ; 7 ; 7.3 ; 7.3 ; 7.4 ; 7.4 ; 7.4 ; 7.6 ; 7.8 ; 7.7 ; 7.6 ; 7.6 ; 7.3 ; 7.4 ; 7.4 ; 7.3 ; 7.1 ; 7 ; 7.1 ; 7.1 ; 7 ; 6.9 ; 6.8 ; 6.7 ; 6.8 ; 6.6 ; 6.5 ; 6.6 ; NA ; 6.5 ; 6.4 ; 6.1 ; NA ; NA ; 6 ; 5.9 ; 5.8 ; 5.6 ; 5.5 ; 5.6 ; 5.4 ; 5.4 ; 5.8 ; 5.6 ; 5.6 ; 5.7 ; 5.7 ; 5.6 ; 5.5 ; 5.6 ; NA ; NA ; NA ; NA ; NA ; 5.6 ; 5.3 ; 5.5 ; 5.1 ; 5.2 ; 5.2 ; 5.4 ; 5.4 ; 5.3 ; 5.2 ; 5.2 ; 5.1 ; 4.9 ; 5 ; 4.9 ; 4.8 ; 4.9 ; 4.7 ; 4.6 ; 4.7 ; 4.6 ; 4.6 ; 4.7 ; 4.3 ; 4.4 ; 4.5 ; 4.5 ; 4.5 ; 4.6 ; 4.5 ; NA ; NA ; NA ; 4.4 ; 4.2 ; 4.3 ; 4.2 ; 4.3 ; 4.3 ; 4.2 ; 4.2 ; 4.1 ; 4.1 ; 4 ; 4 ; 4.1 ; 4 ; 3.8 ; 4 ; 4 ; 4 ; 4.1 ; 3.9 ; NA ; NA ; NA ; 4.2 ; 4.2 ; 4.3 ; 4.4 ; 4.3 ; 4.5 ; 4.6 ; 4.9 ; 5 ; 5.3 ; 5.5 ; 5.7 ; NA ; NA ; NA ; 5.9 ; 5.8 ; 5.8 ; 5.8 ; 5.7 ; 5.7 ; 5.7 ; 5.9 ; 6 ; 5.8 ; 5.9 ; 5.9 ; 6 ; 6.1 ; 6.3 ; 6.2 ; 6.1 ; 6.1 ; 6 ; 5.8 ; 5.7 ; 5.7 ; 5.6 ; 5.8 ; 5.6 ; 5.6 ; 5.6 ; 5.5 ; 5.4 ; 5.4 ; 5.5 ; 5.4 ; 5.4 ; 5.3 ; 5.4 ; 
5.2 ; 5.2 ; 5.1 ; 5 ; 5 ; 4.9 ; 5 ; 5 ; 5 ; 4.9 ; 4.7 ; 4.8 ; 4.7 ; 4.7 ; 4.6 ; 4.6 ; 4.7 ; 4.7 ; 4.5 ; 4.4 ; 4.5 ; 4.4 ; 4.6 ; 4.5 ; 4.4 ; 4.5 ; 4.4 ; 4.6 ; 4.7 ; 4.6 ; 4.7 ; 4.7 ; 4.7 ; 5 ; 5 ; 4.9 ; 5.1 ; 5 ; 5.4 ; 5.6 ; 5.8 ; NA ; NA ; NA ; 6.8 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; 10 ; 9.9 ; 9.9 ; 9.8 ; 9.8 ; 9.9 ; 9.9 ; 9.6 ; 9.4 ; 9.4 ; 9.5 ; 9.5 ; 9.4 ; 9.8 ; 9.3 ; 9.1 ; 9 ; 9 ; 9.1 ; 9 ; 9.1 ; 9 ; 9 ; 9 ; 8.8 ; NA ; NA ; NA ; NA ; NA ; 8.2 ; 8.2 ; 8.2 ; 8.2 ; 8.1 ; 7.8 ; 7.8 ; 7.7 ; 7.9 ; 8 ; 7.7 ; 7.5 ; 7.6 ; 7.5 ; 7.5 ; 7.3 ; 7.3 ; 7.3 ; 7.2 ; 6.9 ; 6.7 ; 6.6 ; 6.7 ; 6.7 ; 6.2 ; 6.2 ; 6.1 ; 6.2 ; 6.2 ; 6 ; 5.7 ; 5.8 ; 5.6 ; 5.7 ; 5.5 ; 5.5 ; 5.4 ; 5.5 ; 5.3 ; 5.3 ; 5.1 ; 5.1 ; 5 ; 5 ; 5 ; 11.6 ; NA ; 10.9 ; NA ; 9.4 ; 9.8 ; 9.7 ; 9 ; 9 ; 8.3 ; 8.3 ; 8.2 ; 9.5 ; 9.2 ; 8.8 ; NA ; 7.6 ; 8.2 ; 7.8 ; NA ; 7.5 ; 6.6 ; NA ; 6.2 ; 7.5 ; 7.2 ; 6.9 ; 6 ; 5.9 ; 6.4 ; 6.5"
unemCore2 <- "5.9 ; 6 ; 5.2 ; NA ; 5.7 ; 6.7 ; 6.4 ; 6.2 ; 5.4 ; 5.2 ; 5.7 ; 5.4 ; NA ; 5.4 ; 4.7 ; 4.8 ; 4.9 ; 6.1 ; 5.8 ; 5.8 ; 5.3 ; 5.6 ; 6.1 ; 6.1 ; 5.7 ; 5.6 ; 5.2 ; NA ; 5.1 ; 6.7 ; 6.4 ; 6.3 ; 5.7 ; 5.9 ; 6.5 ; NA ; 6.5 ; 6.8 ; 6.4 ; 6.7 ; 7 ; 8.4 ; 8.2 ; 8 ; 7.5 ; 7.5 ; 8 ; 8 ; 7.6 ; 7.6 ; 7.1 ; 7.3 ; 7.4 ; 8.3 ; 8 ; 7.7 ; 6.9 ; 6.8 ; 7.1 ; 6.6 ; 6.4 ; 6.4 ; 5.7 ; 5.6 ; 5.6 ; 6.5 ; 6 ; 5.7 ; 5 ; 4.5 ; 4.7 ; 4.7 ; 4.4 ; 4.4 ; 3.7 ; 3.7 ; 3.8 ; 4.9 ; 4.5 ; 4.4 ; NA ; 3.8 ; 4.2 ; 4.2 ; 3.8 ; 3.8 ; 3.4 ; 3.4 ; 3.5 ; NA ; 4.4 ; 4.3 ; 3.8 ; 3.9 ; 4.2 ; 4.1 ; 3.8 ; 3.9 ; 3.3 ; 3.3 ; NA ; 4.3 ; 4.2 ; 4 ; 3.3 ; 3.2 ; 3.4 ; 3.3 ; 2.8 ; 2.9 ; 2.4 ; 2.5 ; 2.6 ; 3.9 ; 3.7 ; 3.6 ; 3 ; 3 ; 3.4 ; 3.4 ; 3.1 ; 3.2 ; 2.8 ; 3 ; 3.1 ; 4.3 ; 4.1 ; 4 ; 3.7 ; 3.7 ; 4.2 ; 4.3 ; 4.1 ; 4.4 ; 4.1 ; 4.3 ; 4.5 ; 5.8 ; NA ; 5.9 ; 5.6 ; 5.7 ; 6.1 ; 6.5 ; 6.4 ; 6.8 ; 6.5 ; 7 ; 7.3 ; 8.7 ; 8.8 ; 8.9 ; 8.3 ; 8.6 ; 8.9 ; 8.9 ; 8.7 ; 8.8 ; 8.3 ; NA ; NA ; 9.4 ; 9.2 ; 9 ; 8.3 ; 8.4 ; 8.8 ; 8.7 ; 8.3 ; 8.4 ; 7.7 ; NA ; 7.6 ; 8.6 ; 8.2 ; 7.8 ; 7.1 ; 6.9 ; 7.1 ; 6.9 ; 6.5 ; 6.6 ; 6.1 ; 6 ; 6 ; 7.3 ; 6.9 ; 6.7 ; 6.1 ; 5.8 ; 6.2 ; 6.1 ; 5.8 ; 5.8 ; 5.4 ; 5.3 ; 5.3 ; 6.4 ; 5.9 ; 5.7 ; 5.3 ; NA ; 5.5 ; 5.5 ; 5.1 ; 5.2 ; 4.6 ; 4.6 ; 4.6 ; 5.7 ; 5.2 ; 5 ; 4.4 ; 4.4 ; 4.5 ; 4.6 ; 4 ; 4.2 ; 3.7 ; 3.8 ; 3.8 ; 4.9 ; 4.6 ; 4.4 ; 3.9 ; 3.8 ; 4.2 ; 4.1 ; 3.7 ; 3.8 ; 3.2 ; 3.2 ; 3.3 ; 4.2 ; 3.9 ; 3.8 ; 3 ; 3.1 ; 3.5 ; 3.4 ; 3 ; 3.2 ; 2.8 ; 2.8 ; 2.9 ; 3.9 ; 3.6 ; 3.4 ; 3.1 ; 3 ; 3.4 ; 3.5 ; 3 ; 3.2 ; 2.7 ; 2.7 ; 2.7 ; 3.6 ; 3.3 ; 3.1 ; 2.4 ; 2.5 ; 2.8 ; 2.8 ; 2.5 ; 2.5 ; 2.1 ; 2.3 ; 2.4 ; 3.7 ; 3.5 ; 3.6 ; 3.2 ; 3.3 ; 3.8 ; 3.9 ; 3.8 ; 4 ; 3.7 ; 4 ; 4.2 ; 5.5 ; 5.3 ; 5.3 ; 5 ; 5 ; 5.5 ; 5.5 ; 5.3 ; 5.4 ; 5 ; 5.2 ; 5.2 ; 6.3 ; 6 ; 5.9 ; 5.5 ; 5.6 ; 6.1 ; 5.9 ; 5.7 ; 5.8 ; 5.2 ; 5.3 ; 5.2 ; 6.2 ; 5.8 ; 5.7 ; 5.1 ; 5.1 ; 5.5 ; 5.3 ; 4.8 ; 4.9 ; 4.4 ; 4.5 ; 4.5 ; 5.6 ; 5.4 ; 5.1 ; 4.7 ; 4.6 ; 5 ; 4.9 ; 4.5 ; 4.9 ; 4.4 ; 4.7 ; 4.6 ; 5.5 ; 5.4 ; 5.2 ; 4.9 ; 4.7 ; 5.1 ; 5 ; 4.7 ; 4.9 ; 4.3 ; 4.5 ; 4.6 ; 5.6 ; 5.2 ; 
4.8 ; NA ; NA ; 4.8 ; 4.7 ; 4.3 ; 4.5 ; 4 ; 4.1 ; 4.4 ; 5.4 ; 5.2 ; 5 ; 4.6 ; 5 ; 5.6 ; 5.7 ; 5.6 ; 5.9 ; 5.7 ; 6.1 ; 6.6 ; 7.9 ; 7.9 ; 7.8 ; 7.3 ; 7.7 ; 8.3 ; 8.4 ; 8.2 ; 8.6 ; 8.2 ; 8.3 ; 8.5 ; 9.6 ; 9.2 ; 8.9 ; 8.3 ; 8.2 ; 8.4 ; 8.3 ; 7.9 ; 8 ; 7.5 ; 7.7 ; 7.6 ; 8.5 ; 8.1 ; 7.7 ; 7.2 ; 7.1 ; 7.6 ; 7.4 ; 6.9 ; 7.1 ; 6.5 ; 6.4 ; 6.6 ; 7.4 ; 7.2 ; 6.8 ; 6.3 ; 6.3 ; 6.9 ; 6.9 ; 6.6 ; 6.6 ; 6.2 ; 6.2 ; 6.5 ; 7.6 ; 7.2 ; 7 ; 6.6 ; 6.6 ; 7.3 ; 7 ; 6.6 ; 6.6 ; 6.2 ; 6 ; 5.9 ; 6.8 ; 6.5 ; 6.2 ; 5.5 ; 5.5 ; 6 ; 6 ; NA ; 5.7 ; 5 ; 5 ; 4.9 ; 5.8 ; 5.5 ; 5.2 ; 4.7 ; 4.9 ; 5.2 ; 5.2 ; 4.7 ; 4.9 ; 4.5 ; 4.5 ; 4.6"
unemIndex1 <- "Jan 1976 ; Feb 1976 ; Mar 1976 ; Apr 1976 ; May 1976 ; Jun 1976 ; Jul 1976 ; Aug 1976 ; Sep 1976 ; Oct 1976 ; Nov 1976 ; Dec 1976 ; Jan 1977 ; Feb 1977 ; Mar 1977 ; Apr 1977 ; May 1977 ; Jun 1977 ; Jul 1977 ; Aug 1977 ; Sep 1977 ; Oct 1977 ; Nov 1977 ; Dec 1977 ; Jan 1978 ; Feb 1978 ; Mar 1978 ; Apr 1978 ; May 1978 ; Jun 1978 ; Jul 1978 ; Aug 1978 ; Sep 1978 ; Oct 1978 ; Nov 1978 ; Dec 1978 ; Jan 1979 ; Feb 1979 ; Mar 1979 ; Apr 1979 ; May 1979 ; Jun 1979 ; Jul 1979 ; Aug 1979 ; Sep 1979 ; Oct 1979 ; Nov 1979 ; Dec 1979 ; Jan 1980 ; Feb 1980 ; Mar 1980 ; Apr 1980 ; May 1980 ; Jun 1980 ; Jul 1980 ; Aug 1980 ; Sep 1980 ; Oct 1980 ; Nov 1980 ; Dec 1980 ; Jan 1981 ; Feb 1981 ; Mar 1981 ; Apr 1981 ; May 1981 ; Jun 1981 ; Jul 1981 ; Aug 1981 ; Sep 1981 ; Oct 1981 ; Nov 1981 ; Dec 1981 ; Jan 1982 ; Feb 1982 ; Mar 1982 ; Apr 1982 ; May 1982 ; Jun 1982 ; Jul 1982 ; Aug 1982 ; Sep 1982 ; Oct 1982 ; Nov 1982 ; Dec 1982 ; Jan 1983 ; Feb 1983 ; Mar 1983 ; Apr 1983 ; May 1983 ; Jun 1983 ; Jul 1983 ; Aug 1983 ; Sep 1983 ; Oct 1983 ; Nov 1983 ; Dec 1983 ; Jan 1984 ; Feb 1984 ; Mar 1984 ; Apr 1984 ; May 1984 ; Jun 1984 ; Jul 1984 ; Aug 1984 ; Sep 1984 ; Oct 1984 ; Nov 1984 ; Dec 1984 ; Jan 1985 ; Feb 1985 ; Mar 1985 ; Apr 1985 ; May 1985 ; Jun 1985 ; Jul 1985 ; Aug 1985 ; Sep 1985 ; Oct 1985 ; Nov 1985 ; Dec 1985 ; Jan 1986 ; Feb 1986 ; Mar 1986 ; Apr 1986 ; May 1986 ; Jun 1986 ; Jul 1986 ; Aug 1986 ; Sep 1986 ; Oct 1986 ; Nov 1986 ; Dec 1986 ; Jan 1987 ; Feb 1987 ; Mar 1987 ; Apr 1987 ; May 1987 ; Jun 1987 ; Jul 1987 ; Aug 1987 ; Sep 1987 ; Oct 1987 ; Nov 1987 ; Dec 1987 ; Jan 1988 ; Feb 1988 ; Mar 1988 ; Apr 1988 ; May 1988 ; Jun 1988 ; Jul 1988 ; Aug 1988 ; Sep 1988 ; Oct 1988 ; Nov 1988 ; Dec 1988 ; Jan 1989 ; Feb 1989 ; Mar 1989 ; Apr 1989 ; May 1989 ; Jun 1989 ; Jul 1989 ; Aug 1989 ; Sep 1989 ; Oct 1989 ; Nov 1989 ; Dec 1989 ; Jan 1990 ; Feb 1990 ; Mar 1990 ; Apr 1990 ; May 1990 ; Jun 1990 ; Jul 1990 ; Aug 1990 ; Sep 1990 ; Oct 1990 ; Nov 1990 ; Dec 1990 ; Jan 
1991 ; Feb 1991 ; Mar 1991 ; Apr 1991 ; May 1991 ; Jun 1991 ; Jul 1991 ; Aug 1991 ; Sep 1991 ; Oct 1991 ; Nov 1991 ; Dec 1991 ; Jan 1992 ; Feb 1992 ; Mar 1992 ; Apr 1992 ; May 1992 ; Jun 1992 ; Jul 1992 ; Aug 1992 ; Sep 1992 ; Oct 1992 ; Nov 1992 ; Dec 1992 ; Jan 1993 ; Feb 1993 ; Mar 1993 ; Apr 1993 ; May 1993 ; Jun 1993 ; Jul 1993 ; Aug 1993 ; Sep 1993 ; Oct 1993 ; Nov 1993 ; Dec 1993 ; Jan 1994 ; Feb 1994 ; Mar 1994 ; Apr 1994 ; May 1994 ; Jun 1994 ; Jul 1994 ; Aug 1994 ; Sep 1994 ; Oct 1994 ; Nov 1994 ; Dec 1994 ; Jan 1995 ; Feb 1995 ; Mar 1995 ; Apr 1995 ; May 1995 ; Jun 1995 ; Jul 1995 ; Aug 1995 ; Sep 1995 ; Oct 1995 ; Nov 1995 ; Dec 1995 ; Jan 1996 ; Feb 1996 ; Mar 1996 ; Apr 1996 ; May 1996 ; Jun 1996 ; Jul 1996 ; Aug 1996 ; Sep 1996 ; Oct 1996 ; Nov 1996 ; Dec 1996"
unemIndex2 <- "Jan 1997 ; Feb 1997 ; Mar 1997 ; Apr 1997 ; May 1997 ; Jun 1997 ; Jul 1997 ; Aug 1997 ; Sep 1997 ; Oct 1997 ; Nov 1997 ; Dec 1997 ; Jan 1998 ; Feb 1998 ; Mar 1998 ; Apr 1998 ; May 1998 ; Jun 1998 ; Jul 1998 ; Aug 1998 ; Sep 1998 ; Oct 1998 ; Nov 1998 ; Dec 1998 ; Jan 1999 ; Feb 1999 ; Mar 1999 ; Apr 1999 ; May 1999 ; Jun 1999 ; Jul 1999 ; Aug 1999 ; Sep 1999 ; Oct 1999 ; Nov 1999 ; Dec 1999 ; Jan 2000 ; Feb 2000 ; Mar 2000 ; Apr 2000 ; May 2000 ; Jun 2000 ; Jul 2000 ; Aug 2000 ; Sep 2000 ; Oct 2000 ; Nov 2000 ; Dec 2000 ; Jan 2001 ; Feb 2001 ; Mar 2001 ; Apr 2001 ; May 2001 ; Jun 2001 ; Jul 2001 ; Aug 2001 ; Sep 2001 ; Oct 2001 ; Nov 2001 ; Dec 2001 ; Jan 2002 ; Feb 2002 ; Mar 2002 ; Apr 2002 ; May 2002 ; Jun 2002 ; Jul 2002 ; Aug 2002 ; Sep 2002 ; Oct 2002 ; Nov 2002 ; Dec 2002 ; Jan 2003 ; Feb 2003 ; Mar 2003 ; Apr 2003 ; May 2003 ; Jun 2003 ; Jul 2003 ; Aug 2003 ; Sep 2003 ; Oct 2003 ; Nov 2003 ; Dec 2003 ; Jan 2004 ; Feb 2004 ; Mar 2004 ; Apr 2004 ; May 2004 ; Jun 2004 ; Jul 2004 ; Aug 2004 ; Sep 2004 ; Oct 2004 ; Nov 2004 ; Dec 2004 ; Jan 2005 ; Feb 2005 ; Mar 2005 ; Apr 2005 ; May 2005 ; Jun 2005 ; Jul 2005 ; Aug 2005 ; Sep 2005 ; Oct 2005 ; Nov 2005 ; Dec 2005 ; Jan 2006 ; Feb 2006 ; Mar 2006 ; Apr 2006 ; May 2006 ; Jun 2006 ; Jul 2006 ; Aug 2006 ; Sep 2006 ; Oct 2006 ; Nov 2006 ; Dec 2006 ; Jan 2007 ; Feb 2007 ; Mar 2007 ; Apr 2007 ; May 2007 ; Jun 2007 ; Jul 2007 ; Aug 2007 ; Sep 2007 ; Oct 2007 ; Nov 2007 ; Dec 2007 ; Jan 2008 ; Feb 2008 ; Mar 2008 ; Apr 2008 ; May 2008 ; Jun 2008 ; Jul 2008 ; Aug 2008 ; Sep 2008 ; Oct 2008 ; Nov 2008 ; Dec 2008 ; Jan 2009 ; Feb 2009 ; Mar 2009 ; Apr 2009 ; May 2009 ; Jun 2009 ; Jul 2009 ; Aug 2009 ; Sep 2009 ; Oct 2009 ; Nov 2009 ; Dec 2009 ; Jan 2010 ; Feb 2010 ; Mar 2010 ; Apr 2010 ; May 2010 ; Jun 2010 ; Jul 2010 ; Aug 2010 ; Sep 2010 ; Oct 2010 ; Nov 2010 ; Dec 2010 ; Jan 2011 ; Feb 2011 ; Mar 2011 ; Apr 2011 ; May 2011 ; Jun 2011 ; Jul 2011 ; Aug 2011 ; Sep 2011 ; Oct 2011 ; Nov 2011 ; Dec 2011 ; Jan 
2012 ; Feb 2012 ; Mar 2012 ; Apr 2012 ; May 2012 ; Jun 2012 ; Jul 2012 ; Aug 2012 ; Sep 2012 ; Oct 2012 ; Nov 2012 ; Dec 2012 ; Jan 2013 ; Feb 2013 ; Mar 2013 ; Apr 2013 ; May 2013 ; Jun 2013 ; Jul 2013 ; Aug 2013 ; Sep 2013 ; Oct 2013 ; Nov 2013 ; Dec 2013 ; Jan 2014 ; Feb 2014 ; Mar 2014 ; Apr 2014 ; May 2014 ; Jun 2014 ; Jul 2014 ; Aug 2014 ; Sep 2014 ; Oct 2014 ; Nov 2014 ; Dec 2014 ; Jan 2015 ; Feb 2015 ; Mar 2015 ; Apr 2015 ; May 2015 ; Jun 2015 ; Jul 2015 ; Aug 2015 ; Sep 2015 ; Oct 2015 ; Nov 2015 ; Dec 2015"
# Reassemble the split raw strings into one values string and one index string.
unemCore <- paste(unemCore1, unemCore2, sep=" ; ")
unemIndex <- paste(unemIndex1, unemIndex2, sep=" ; ")
# Parse the 960 tokens into a 480x2 numeric matrix. byrow=FALSE fills
# column-wise, so the first 480 values (the US series) become column 1 and the
# remaining 480 (the MA series) become column 2 -- consistent with the
# Jan 1976 row (us=7.9, ma=11.6) shown in the head() output further below.
# The literal "NA" tokens are what trigger the coercion warning.
mtxCore <- matrix(data=as.numeric(strsplit(unemCore, " ; ")[[1]]), ncol=2, byrow=FALSE)
## Warning in matrix(data = as.numeric(strsplit(unemCore, " ; ")[[1]]), ncol =
## 2, : NAs introduced by coercion
colnames(mtxCore) <- c("us", "ma")
# Parse "Mon YYYY" strings into a zoo yearmon index, then build the xts object.
vecIndex <- as.yearmon(strsplit(unemIndex, " ; ")[[1]], "%b %Y")
unemployment <- as.xts(mtxCore, order.by=vecIndex)
str(unemployment)
## An 'xts' object on Jan 1976/Dec 2015 containing:
## Data: num [1:480, 1:2] 7.9 7.7 7.6 7.7 7.4 7.6 NA NA 7.6 7.7 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "us" "ma"
## Indexed by objects of class: [yearmon] TZ:
## xts Attributes:
## NULL
# View a summary of your unemployment data
# (note the NA counts: 73 missing in us, 20 in ma, interpolated next)
summary(unemployment)
## Index us ma
## Min. :1976 Min. : 3.800 Min. : 2.100
## 1st Qu.:1986 1st Qu.: 5.300 1st Qu.: 4.300
## Median :1996 Median : 6.000 Median : 5.500
## Mean :1996 Mean : 6.365 Mean : 5.612
## 3rd Qu.:2006 3rd Qu.: 7.300 3rd Qu.: 6.800
## Max. :2016 Max. :10.800 Max. :11.600
## NA's :73 NA's :20
# Use na.approx to remove missing values in unemployment data
# (zoo::na.approx fills interior NAs by linear interpolation between
# neighboring observations; see e.g. the interpolated Dec 1976 us=7.65 below)
unemployment <- na.approx(unemployment)
# Plot new unemployment data
lty <- c(1, 2)
plot.zoo(unemployment, plot.type = "single", lty = lty)
labels <- c("US Unemployment (%)" , "MA Unemployment (%)")
legend("topright", lty = lty, legend = labels, bg = "white")
# Create a one month lag of US unemployment
# (each date carries the prior month's value; first k rows become NA)
us_monthlag <- stats::lag(unemployment$us, k = 1) # caution that dplyr::lag can mask stats::lag
# Create a one year lag of US unemployment
us_yearlag <- stats::lag(unemployment$us, k = 12) # caution that dplyr::lag can mask stats::lag
# Merge your original data with your new lags
# (merge.xts joins on the shared index; duplicate "us" names get .1/.2 suffixes)
unemployment_lags <- merge(unemployment, us_monthlag, us_yearlag)
# View the first 15 rows of unemployment_lags
head(unemployment_lags, n=15)
## us ma us.1 us.2
## Jan 1976 7.90 11.60 NA NA
## Feb 1976 7.70 11.25 7.90 NA
## Mar 1976 7.60 10.90 7.70 NA
## Apr 1976 7.70 10.15 7.60 NA
## May 1976 7.40 9.40 7.70 NA
## Jun 1976 7.60 9.80 7.40 NA
## Jul 1976 7.60 9.70 7.60 NA
## Aug 1976 7.60 9.00 7.60 NA
## Sep 1976 7.60 9.00 7.60 NA
## Oct 1976 7.70 8.30 7.60 NA
## Nov 1976 7.80 8.30 7.70 NA
## Dec 1976 7.65 8.20 7.80 NA
## Jan 1977 7.50 9.50 7.65 7.9
## Feb 1977 7.60 9.20 7.50 7.6
## Mar 1977 7.40 8.80 7.60 7.6
# Generate monthly difference in unemployment
unemployment$us_monthlydiff <- diff(unemployment$us, lag = 1, differences = 1)
# Generate yearly difference in unemployment
# (lag = 12 gives the change versus the same month one year earlier)
unemployment$us_yearlydiff <- diff(unemployment$us, lag = 12, differences = 1)
# Plot US unemployment and annual difference
par(mfrow = c(2,1))
plot.xts(unemployment$us)
plot.xts(unemployment$us_yearlydiff, type = "h")
par(mfrow=c(1, 1))
# Add a quarterly difference in gdp
# NOTE(review): gdp_xts is defined elsewhere in this document -- presumably a
# quarterly xts series with a "gdp" column; confirm against the earlier setup.
gdp_xts <- na.approx(gdp_xts)
gdp_xts$quarterly_diff <- diff(gdp_xts$gdp, lag = 1, differences = 1)
# Split gdp$quarterly_diff into years
# (split.xts with f = "years" returns a list with one xts chunk per year)
gdpchange_years <- split(gdp_xts$quarterly_diff, f = "years")
# Use lapply to calculate the cumsum each year
# (cumsum restarts within each yearly chunk, giving year-to-date totals)
gdpchange_ytd <- lapply(gdpchange_years, FUN = cumsum)
# Use do.call to rbind the results
# (rbind.xts stitches the yearly pieces back into one ordered series)
gdpchange_xts <- do.call(rbind, gdpchange_ytd)
# Plot cumulative year-to-date change in GDP
plot.xts(gdpchange_xts, type = "h")
# Use rollapply to calculate the rolling yearly average US unemployment
# (the xts method of rollapply right-aligns windows, so each value is a
# trailing 12-month mean -- e.g. Dec 2015's 5.283333 below is the mean of
# Jan-Dec 2015)
unemployment$year_avg <- rollapply(unemployment$us, width = 12, FUN = mean)
# Plot all columns of US unemployment data
lty <- c(2, 1)
lwd <- c(1, 2)
plot.zoo(unemployment[, c("us", "year_avg")], plot.type = "single", lty = lty, lwd = lwd)
# Add a one-year lag of MA unemployment
unemployment$ma_yearlag <- stats::lag(unemployment$ma, k=12) # caution that dplyr::lag can mask stats::lag
# Add a six-month difference of MA unemployment
unemployment$ma_sixmonthdiff <- diff(unemployment$ma, lag=6, differences=1)
# Add a six-month rolling average of MA unemployment
unemployment$ma_sixmonthavg <- rollapply(unemployment$ma, width=6, FUN=mean)
# Add a yearly rolling maximum of MA unemployment
unemployment$ma_yearmax <- rollapply(unemployment$ma, width=12, FUN=max)
# View the last year of unemployment data
tail(unemployment, n=12)
## us ma us_monthlydiff us_yearlydiff year_avg ma_yearlag
## Jan 2015 5.7 5.8 0.1 -0.9 6.091667 6.80
## Feb 2015 5.5 5.5 -0.2 -1.2 5.991667 6.50
## Mar 2015 5.5 5.2 0.0 -1.2 5.891667 6.20
## Apr 2015 5.4 4.7 -0.1 -0.8 5.825000 5.50
## May 2015 5.5 4.9 0.1 -0.7 5.766667 5.50
## Jun 2015 5.3 5.2 -0.2 -0.8 5.700000 6.00
## Jul 2015 5.3 5.2 0.0 -0.9 5.625000 6.00
## Aug 2015 5.1 4.7 -0.2 -1.1 5.533333 5.85
## Sep 2015 5.1 4.9 0.0 -0.9 5.458333 5.70
## Oct 2015 5.0 4.5 -0.1 -0.7 5.400000 5.00
## Nov 2015 5.0 4.5 0.0 -0.8 5.333333 5.00
## Dec 2015 5.0 4.6 0.0 -0.6 5.283333 4.90
## ma_sixmonthdiff ma_sixmonthavg ma_yearmax
## Jan 2015 -0.20 5.375000 6.50
## Feb 2015 -0.35 5.316667 6.20
## Mar 2015 -0.50 5.233333 6.00
## Apr 2015 -0.30 5.183333 6.00
## May 2015 -0.10 5.166667 6.00
## Jun 2015 0.30 5.216667 6.00
## Jul 2015 -0.60 5.116667 5.85
## Aug 2015 -0.80 4.983333 5.80
## Sep 2015 -0.30 4.933333 5.80
## Oct 2015 -0.20 4.900000 5.80
## Nov 2015 -0.40 4.833333 5.80
## Dec 2015 -0.60 4.733333 5.80
Chapter 4 - Sports Data
Advanced features of xts:
Indexing commands in xts:
Example code includes:
# Raw game-level data for the Chapter 4 sports example, semicolon-delimited
# and split into chunks to stay within document line limits. rsDate1/rsDate2
# hold ISO game dates for one set of games (2010-2015) and rsDate3/rsDate4 for
# another; rsScore/oppScore hold the corresponding runs scored by the team and
# its opponents. Note: each string literal intentionally spans a line break.
rsDate1 <- "2010-04-04 ; 2010-04-06 ; 2010-04-07 ; 2010-04-16 ; 2010-04-17 ; 2010-04-18 ; 2010-04-19 ; 2010-04-20 ; 2010-04-21 ; 2010-04-22 ; 2010-04-23 ; 2010-04-24 ; 2010-04-25 ; 2010-05-03 ; 2010-05-04 ; 2010-05-05 ; 2010-05-06 ; 2010-05-07 ; 2010-05-08 ; 2010-05-09 ; 2010-05-10 ; 2010-05-11 ; 2010-05-12 ; 2010-05-19 ; 2010-05-20 ; 2010-05-27 ; 2010-05-28 ; 2010-05-29 ; 2010-05-30 ; 2010-06-01 ; 2010-06-02 ; 2010-06-03 ; 2010-06-11 ; 2010-06-12 ; 2010-06-13 ; 2010-06-15 ; 2010-06-16 ; 2010-06-17 ; 2010-06-18 ; 2010-06-19 ; 2010-06-20 ; 2010-06-29 ; 2010-06-30 ; 2010-07-02 ; 2010-07-03 ; 2010-07-04 ; 2010-07-15 ; 2010-07-16 ; 2010-07-17 ; 2010-07-18 ; 2010-07-30 ; 2010-07-31 ; 2010-08-01 ; 2010-08-02 ; 2010-08-03 ; 2010-08-04 ; 2010-08-05 ; 2010-08-17 ; 2010-08-18 ; 2010-08-19 ; 2010-08-20 ; 2010-08-21 ; 2010-08-22 ; 2010-08-23 ; 2010-08-25 ; 2010-08-25 ; 2010-09-04 ; 2010-09-04 ; 2010-09-05 ; 2010-09-06 ; 2010-09-07 ; 2010-09-08 ; 2010-09-17 ; 2010-09-18 ; 2010-09-19 ; 2010-09-20 ; 2010-09-21 ; 2010-09-22 ; 2010-10-02 ; 2010-10-02 ; 2010-10-03 ; 2011-04-08 ; 2011-04-09 ; 2011-04-10 ; 2011-04-11 ; 2011-04-12 ; 2011-04-15 ; 2011-04-16 ; 2011-04-17 ; 2011-04-18 ; 2011-04-29 ; 2011-04-30 ; 2011-05-01 ; 2011-05-02 ; 2011-05-03 ; 2011-05-04 ; 2011-05-05 ; 2011-05-06 ; 2011-05-07 ; 2011-05-08 ; 2011-05-09 ; 2011-05-16 ; 2011-05-18 ; 2011-05-19 ; 2011-05-20 ; 2011-05-21 ; 2011-05-22 ; 2011-05-30 ; 2011-05-31 ; 2011-06-01 ; 2011-06-03 ; 2011-06-04 ; 2011-06-05 ; 2011-06-17 ; 2011-06-18 ; 2011-06-19 ; 2011-06-20 ; 2011-06-21 ; 2011-06-22 ; 2011-07-04 ; 2011-07-05 ; 2011-07-06 ; 2011-07-07 ; 2011-07-08 ; 2011-07-09 ; 2011-07-10 ; 2011-07-22 ; 2011-07-23 ; 2011-07-24 ; 2011-07-25 ; 2011-07-26 ; 2011-07-27 ; 2011-07-28 ; 2011-08-01 ; 2011-08-02 ; 2011-08-03 ; 2011-08-04 ; 2011-08-05 ; 2011-08-06 ; 2011-08-07 ; 2011-08-16 ; 2011-08-16 ; 2011-08-17 ; 2011-08-26 ; 2011-08-27 ; 2011-08-27 ; 2011-08-30 ; 2011-08-31 ; 2011-09-01 ; 2011-09-02 ; 2011-09-03 ; 2011-09-04 ; 2011-09-13 
; 2011-09-14 ; 2011-09-15 ; 2011-09-16 ; 2011-09-17 ; 2011-09-18 ; 2011-09-19 ; 2011-09-19 ; 2011-09-20 ; 2011-09-21 ; 2012-04-13 ; 2012-04-14 ; 2012-04-15 ; 2012-04-16 ; 2012-04-17 ; 2012-04-18 ; 2012-04-20 ; 2012-04-21 ; 2012-04-30 ; 2012-05-01 ; 2012-05-02 ; 2012-05-04 ; 2012-05-05 ; 2012-05-06 ; 2012-05-10 ; 2012-05-11 ; 2012-05-12 ; 2012-05-13 ; 2012-05-14 ; 2012-05-15 ; 2012-05-25 ; 2012-05-26 ; 2012-05-27 ; 2012-05-28 ; 2012-05-29 ; 2012-05-30 ; 2012-05-31 ; 2012-06-05 ; 2012-06-06 ; 2012-06-07 ; 2012-06-08 ; 2012-06-09 ; 2012-06-10 ; 2012-06-19 ; 2012-06-20 ; 2012-06-21 ; 2012-06-22 ; 2012-06-23 ; 2012-06-24 ; 2012-06-25 ; 2012-06-26 ; 2012-06-27 ; 2012-07-06 ; 2012-07-07 ; 2012-07-07 ; 2012-07-08 ; 2012-07-16 ; 2012-07-17 ; 2012-07-18 ; 2012-07-19 ; 2012-07-20 ; 2012-07-21 ; 2012-07-22 ; 2012-07-30 ; 2012-07-31 ; 2012-08-01 ; 2012-08-02 ; 2012-08-03 ; 2012-08-04 ; 2012-08-05 ; 2012-08-06 ; 2012-08-07 ; 2012-08-08 ; 2012-08-21 ; 2012-08-22 ; 2012-08-23 ; 2012-08-24 ; 2012-08-25 ; 2012-08-26 ; 2012-08-27 ; 2012-09-07 ; 2012-09-08 ; 2012-09-09 ; 2012-09-11 ; 2012-09-12 ; 2012-09-13 ; 2012-09-21 ; 2012-09-22 ; 2012-09-23 ; 2012-09-25 ; 2012-09-26"
rsDate2 <- "2013-04-08 ; 2013-04-10 ; 2013-04-11 ; 2013-04-13 ; 2013-04-14 ; 2013-04-15 ; 2013-04-20 ; 2013-04-21 ; 2013-04-21 ; 2013-04-22 ; 2013-04-23 ; 2013-04-24 ; 2013-04-25 ; 2013-04-26 ; 2013-04-27 ; 2013-04-28 ; 2013-05-06 ; 2013-05-07 ; 2013-05-08 ; 2013-05-09 ; 2013-05-10 ; 2013-05-11 ; 2013-05-12 ; 2013-05-23 ; 2013-05-24 ; 2013-05-25 ; 2013-05-26 ; 2013-05-27 ; 2013-05-28 ; 2013-06-04 ; 2013-06-05 ; 2013-06-06 ; 2013-06-08 ; 2013-06-08 ; 2013-06-09 ; 2013-06-18 ; 2013-06-18 ; 2013-06-19 ; 2013-06-25 ; 2013-06-26 ; 2013-06-27 ; 2013-06-28 ; 2013-06-29 ; 2013-06-30 ; 2013-07-02 ; 2013-07-03 ; 2013-07-04 ; 2013-07-19 ; 2013-07-20 ; 2013-07-21 ; 2013-07-22 ; 2013-07-23 ; 2013-07-24 ; 2013-07-29 ; 2013-07-30 ; 2013-07-31 ; 2013-08-01 ; 2013-08-02 ; 2013-08-03 ; 2013-08-04 ; 2013-08-16 ; 2013-08-17 ; 2013-08-18 ; 2013-08-27 ; 2013-08-28 ; 2013-08-29 ; 2013-08-30 ; 2013-08-31 ; 2013-09-01 ; 2013-09-02 ; 2013-09-03 ; 2013-09-04 ; 2013-09-13 ; 2013-09-14 ; 2013-09-15 ; 2013-09-17 ; 2013-09-18 ; 2013-09-19 ; 2013-09-20 ; 2013-09-21 ; 2013-09-22 ; 2014-04-04 ; 2014-04-05 ; 2014-04-06 ; 2014-04-07 ; 2014-04-08 ; 2014-04-09 ; 2014-04-18 ; 2014-04-19 ; 2014-04-20 ; 2014-04-21 ; 2014-04-22 ; 2014-04-23 ; 2014-04-24 ; 2014-04-29 ; 2014-05-01 ; 2014-05-01 ; 2014-05-02 ; 2014-05-03 ; 2014-05-04 ; 2014-05-06 ; 2014-05-07 ; 2014-05-16 ; 2014-05-17 ; 2014-05-18 ; 2014-05-20 ; 2014-05-21 ; 2014-05-22 ; 2014-05-28 ; 2014-05-29 ; 2014-05-30 ; 2014-05-31 ; 2014-06-01 ; 2014-06-12 ; 2014-06-13 ; 2014-06-14 ; 2014-06-15 ; 2014-06-16 ; 2014-06-17 ; 2014-06-18 ; 2014-06-30 ; 2014-07-01 ; 2014-07-02 ; 2014-07-05 ; 2014-07-05 ; 2014-07-06 ; 2014-07-07 ; 2014-07-08 ; 2014-07-09 ; 2014-07-10 ; 2014-07-18 ; 2014-07-19 ; 2014-07-20 ; 2014-07-28 ; 2014-07-29 ; 2014-07-30 ; 2014-08-01 ; 2014-08-02 ; 2014-08-03 ; 2014-08-14 ; 2014-08-15 ; 2014-08-16 ; 2014-08-17 ; 2014-08-18 ; 2014-08-19 ; 2014-08-20 ; 2014-08-21 ; 2014-08-22 ; 2014-08-23 ; 2014-08-24 ; 2014-09-05 ; 2014-09-06 ; 2014-09-07 
; 2014-09-08 ; 2014-09-09 ; 2014-09-10 ; 2014-09-23 ; 2014-09-24 ; 2014-09-25 ; 2014-09-26 ; 2014-09-27 ; 2014-09-28 ; 2015-04-13 ; 2015-04-14 ; 2015-04-15 ; 2015-04-17 ; 2015-04-18 ; 2015-04-19 ; 2015-04-20 ; 2015-04-27 ; 2015-04-28 ; 2015-04-29 ; 2015-05-01 ; 2015-05-02 ; 2015-05-03 ; 2015-05-04 ; 2015-05-05 ; 2015-05-06 ; 2015-05-19 ; 2015-05-20 ; 2015-05-21 ; 2015-05-22 ; 2015-05-23 ; 2015-05-24 ; 2015-06-02 ; 2015-06-03 ; 2015-06-03 ; 2015-06-04 ; 2015-06-05 ; 2015-06-06 ; 2015-06-07 ; 2015-06-12 ; 2015-06-13 ; 2015-06-14 ; 2015-06-15 ; 2015-06-16 ; 2015-06-23 ; 2015-06-24 ; 2015-06-25 ; 2015-07-03 ; 2015-07-04 ; 2015-07-05 ; 2015-07-07 ; 2015-07-08 ; 2015-07-10 ; 2015-07-11 ; 2015-07-12 ; 2015-07-24 ; 2015-07-25 ; 2015-07-26 ; 2015-07-27 ; 2015-07-28 ; 2015-07-29 ; 2015-07-30 ; 2015-07-31 ; 2015-08-01 ; 2015-08-02 ; 2015-08-14 ; 2015-08-15 ; 2015-08-16 ; 2015-08-17 ; 2015-08-18 ; 2015-08-19 ; 2015-08-20 ; 2015-08-21 ; 2015-08-22 ; 2015-08-23 ; 2015-08-31 ; 2015-09-01 ; 2015-09-02 ; 2015-09-04 ; 2015-09-05 ; 2015-09-06 ; 2015-09-07 ; 2015-09-08 ; 2015-09-09 ; 2015-09-21 ; 2015-09-22 ; 2015-09-23 ; 2015-09-24 ; 2015-09-25 ; 2015-09-26 ; 2015-09-27"
rsDate3 <- "2010-04-09 ; 2010-04-10 ; 2010-04-11 ; 2010-04-12 ; 2010-04-14 ; 2010-04-15 ; 2010-04-26 ; 2010-04-27 ; 2010-04-28 ; 2010-04-30 ; 2010-05-01 ; 2010-05-02 ; 2010-05-14 ; 2010-05-15 ; 2010-05-16 ; 2010-05-17 ; 2010-05-18 ; 2010-05-21 ; 2010-05-22 ; 2010-05-23 ; 2010-05-24 ; 2010-05-25 ; 2010-05-26 ; 2010-06-04 ; 2010-06-05 ; 2010-06-06 ; 2010-06-07 ; 2010-06-08 ; 2010-06-09 ; 2010-06-10 ; 2010-06-22 ; 2010-06-23 ; 2010-06-24 ; 2010-06-25 ; 2010-06-26 ; 2010-06-27 ; 2010-07-05 ; 2010-07-06 ; 2010-07-07 ; 2010-07-09 ; 2010-07-10 ; 2010-07-11 ; 2010-07-19 ; 2010-07-20 ; 2010-07-21 ; 2010-07-22 ; 2010-07-23 ; 2010-07-24 ; 2010-07-25 ; 2010-07-26 ; 2010-07-27 ; 2010-07-28 ; 2010-08-06 ; 2010-08-07 ; 2010-08-08 ; 2010-08-09 ; 2010-08-10 ; 2010-08-11 ; 2010-08-12 ; 2010-08-13 ; 2010-08-14 ; 2010-08-15 ; 2010-08-27 ; 2010-08-28 ; 2010-08-29 ; 2010-08-31 ; 2010-09-01 ; 2010-09-02 ; 2010-09-10 ; 2010-09-11 ; 2010-09-12 ; 2010-09-13 ; 2010-09-14 ; 2010-09-15 ; 2010-09-24 ; 2010-09-25 ; 2010-09-26 ; 2010-09-27 ; 2010-09-28 ; 2010-09-29 ; 2010-09-30 ; 2011-04-01 ; 2011-04-02 ; 2011-04-03 ; 2011-04-05 ; 2011-04-06 ; 2011-04-07 ; 2011-04-19 ; 2011-04-20 ; 2011-04-21 ; 2011-04-22 ; 2011-04-23 ; 2011-04-24 ; 2011-04-26 ; 2011-04-27 ; 2011-04-28 ; 2011-05-10 ; 2011-05-11 ; 2011-05-13 ; 2011-05-14 ; 2011-05-15 ; 2011-05-23 ; 2011-05-24 ; 2011-05-25 ; 2011-05-26 ; 2011-05-27 ; 2011-05-29 ; 2011-05-29 ; 2011-06-07 ; 2011-06-08 ; 2011-06-09 ; 2011-06-10 ; 2011-06-11 ; 2011-06-12 ; 2011-06-14 ; 2011-06-15 ; 2011-06-16 ; 2011-06-24 ; 2011-06-25 ; 2011-06-26 ; 2011-06-28 ; 2011-06-29 ; 2011-06-30 ; 2011-07-01 ; 2011-07-02 ; 2011-07-03 ; 2011-07-15 ; 2011-07-16 ; 2011-07-17 ; 2011-07-18 ; 2011-07-19 ; 2011-07-20 ; 2011-07-29 ; 2011-07-30 ; 2011-07-31 ; 2011-08-08 ; 2011-08-09 ; 2011-08-10 ; 2011-08-12 ; 2011-08-13 ; 2011-08-14 ; 2011-08-18 ; 2011-08-19 ; 2011-08-20 ; 2011-08-21 ; 2011-08-22 ; 2011-08-23 ; 2011-08-24 ; 2011-08-25 ; 2011-09-05 ; 2011-09-06 ; 2011-09-07 ; 2011-09-08 
; 2011-09-09 ; 2011-09-10 ; 2011-09-11 ; 2011-09-24 ; 2011-09-25 ; 2011-09-25 ; 2011-09-26 ; 2011-09-27 ; 2011-09-28 ; 2012-04-05 ; 2012-04-07 ; 2012-04-08 ; 2012-04-09 ; 2012-04-10 ; 2012-04-11 ; 2012-04-23 ; 2012-04-24 ; 2012-04-25 ; 2012-04-26 ; 2012-04-27 ; 2012-04-28 ; 2012-04-29 ; 2012-05-07 ; 2012-05-08 ; 2012-05-09 ; 2012-05-16 ; 2012-05-17 ; 2012-05-18 ; 2012-05-19 ; 2012-05-20 ; 2012-05-21 ; 2012-05-22 ; 2012-05-23 ; 2012-06-01 ; 2012-06-02 ; 2012-06-03 ; 2012-06-11 ; 2012-06-12 ; 2012-06-13 ; 2012-06-15 ; 2012-06-16 ; 2012-06-17 ; 2012-06-28 ; 2012-06-29 ; 2012-06-30 ; 2012-07-01 ; 2012-07-02 ; 2012-07-03 ; 2012-07-04 ; 2012-07-13 ; 2012-07-14 ; 2012-07-15 ; 2012-07-23 ; 2012-07-24 ; 2012-07-25 ; 2012-07-27 ; 2012-07-28 ; 2012-07-29 ; 2012-08-09 ; 2012-08-10 ; 2012-08-11 ; 2012-08-12 ; 2012-08-14 ; 2012-08-15 ; 2012-08-16 ; 2012-08-17 ; 2012-08-18 ; 2012-08-19 ; 2012-08-28 ; 2012-08-29 ; 2012-08-30 ; 2012-08-31 ; 2012-09-01 ; 2012-09-02 ; 2012-09-03 ; 2012-09-04 ; 2012-09-05 ; 2012-09-14 ; 2012-09-15 ; 2012-09-16 ; 2012-09-17 ; 2012-09-18 ; 2012-09-19 ; 2012-09-20 ; 2012-09-28 ; 2012-09-29 ; 2012-09-30 ; 2012-10-01 ; 2012-10-02 ; 2012-10-03"
rsDate4 <- "2013-04-01 ; 2013-04-03 ; 2013-04-04 ; 2013-04-05 ; 2013-04-06 ; 2013-04-07 ; 2013-04-16 ; 2013-04-17 ; 2013-04-18 ; 2013-04-30 ; 2013-05-01 ; 2013-05-02 ; 2013-05-03 ; 2013-05-04 ; 2013-05-05 ; 2013-05-14 ; 2013-05-15 ; 2013-05-16 ; 2013-05-17 ; 2013-05-18 ; 2013-05-19 ; 2013-05-20 ; 2013-05-21 ; 2013-05-22 ; 2013-05-29 ; 2013-05-30 ; 2013-05-31 ; 2013-06-01 ; 2013-06-02 ; 2013-06-10 ; 2013-06-11 ; 2013-06-12 ; 2013-06-13 ; 2013-06-14 ; 2013-06-15 ; 2013-06-16 ; 2013-06-20 ; 2013-06-21 ; 2013-06-22 ; 2013-06-23 ; 2013-07-05 ; 2013-07-06 ; 2013-07-07 ; 2013-07-08 ; 2013-07-09 ; 2013-07-10 ; 2013-07-11 ; 2013-07-12 ; 2013-07-13 ; 2013-07-14 ; 2013-07-26 ; 2013-07-27 ; 2013-07-28 ; 2013-08-05 ; 2013-08-06 ; 2013-08-07 ; 2013-08-08 ; 2013-08-09 ; 2013-08-10 ; 2013-08-11 ; 2013-08-13 ; 2013-08-14 ; 2013-08-15 ; 2013-08-19 ; 2013-08-20 ; 2013-08-21 ; 2013-08-23 ; 2013-08-24 ; 2013-08-25 ; 2013-09-05 ; 2013-09-06 ; 2013-09-07 ; 2013-09-08 ; 2013-09-10 ; 2013-09-11 ; 2013-09-12 ; 2013-09-24 ; 2013-09-25 ; 2013-09-27 ; 2013-09-28 ; 2013-09-29 ; 2014-03-31 ; 2014-04-02 ; 2014-04-03 ; 2014-04-10 ; 2014-04-11 ; 2014-04-12 ; 2014-04-13 ; 2014-04-15 ; 2014-04-16 ; 2014-04-17 ; 2014-04-25 ; 2014-04-26 ; 2014-04-27 ; 2014-05-09 ; 2014-05-10 ; 2014-05-11 ; 2014-05-13 ; 2014-05-14 ; 2014-05-15 ; 2014-05-23 ; 2014-05-24 ; 2014-05-25 ; 2014-05-26 ; 2014-05-27 ; 2014-06-02 ; 2014-06-03 ; 2014-06-04 ; 2014-06-06 ; 2014-06-07 ; 2014-06-08 ; 2014-06-09 ; 2014-06-10 ; 2014-06-11 ; 2014-06-19 ; 2014-06-20 ; 2014-06-21 ; 2014-06-22 ; 2014-06-23 ; 2014-06-24 ; 2014-06-25 ; 2014-06-27 ; 2014-06-28 ; 2014-06-29 ; 2014-07-11 ; 2014-07-12 ; 2014-07-13 ; 2014-07-21 ; 2014-07-22 ; 2014-07-23 ; 2014-07-24 ; 2014-07-25 ; 2014-07-26 ; 2014-07-27 ; 2014-08-05 ; 2014-08-06 ; 2014-08-07 ; 2014-08-08 ; 2014-08-09 ; 2014-08-10 ; 2014-08-12 ; 2014-08-13 ; 2014-08-25 ; 2014-08-26 ; 2014-08-27 ; 2014-08-29 ; 2014-08-30 ; 2014-08-31 ; 2014-09-01 ; 2014-09-02 ; 2014-09-03 ; 2014-09-04 ; 2014-09-11 
; 2014-09-12 ; 2014-09-13 ; 2014-09-14 ; 2014-09-16 ; 2014-09-17 ; 2014-09-18 ; 2014-09-19 ; 2014-09-20 ; 2014-09-21 ; 2015-04-06 ; 2015-04-08 ; 2015-04-09 ; 2015-04-10 ; 2015-04-11 ; 2015-04-12 ; 2015-04-21 ; 2015-04-22 ; 2015-04-23 ; 2015-04-24 ; 2015-04-25 ; 2015-04-26 ; 2015-05-08 ; 2015-05-09 ; 2015-05-10 ; 2015-05-11 ; 2015-05-12 ; 2015-05-13 ; 2015-05-14 ; 2015-05-15 ; 2015-05-16 ; 2015-05-17 ; 2015-05-25 ; 2015-05-26 ; 2015-05-27 ; 2015-05-28 ; 2015-05-29 ; 2015-05-30 ; 2015-05-31 ; 2015-06-09 ; 2015-06-10 ; 2015-06-11 ; 2015-06-17 ; 2015-06-18 ; 2015-06-19 ; 2015-06-20 ; 2015-06-21 ; 2015-06-26 ; 2015-06-27 ; 2015-06-28 ; 2015-06-29 ; 2015-06-30 ; 2015-07-01 ; 2015-07-02 ; 2015-07-17 ; 2015-07-18 ; 2015-07-20 ; 2015-07-20 ; 2015-07-21 ; 2015-07-22 ; 2015-07-23 ; 2015-08-04 ; 2015-08-05 ; 2015-08-06 ; 2015-08-07 ; 2015-08-08 ; 2015-08-09 ; 2015-08-11 ; 2015-08-12 ; 2015-08-24 ; 2015-08-25 ; 2015-08-26 ; 2015-08-28 ; 2015-08-29 ; 2015-08-30 ; 2015-09-11 ; 2015-09-12 ; 2015-09-13 ; 2015-09-14 ; 2015-09-15 ; 2015-09-16 ; 2015-09-18 ; 2015-09-19 ; 2015-09-20 ; 2015-09-28 ; 2015-09-29 ; 2015-09-30 ; 2015-10-01 ; 2015-10-02 ; 2015-10-03 ; 2015-10-04"
# Combine the four date chunks into a single delimited string of game dates.
rsDate <- paste(rsDate1, rsDate2, rsDate3, rsDate4, sep=" ; ")
rsScore <- "9 ; 4 ; 1 ; 1 ; 5 ; 1 ; 2 ; 7 ; 8 ; 0 ; 4 ; 7 ; 6 ; 17 ; 5 ; 3 ; 11 ; 3 ; 3 ; 9 ; 7 ; 6 ; 2 ; 3 ; 6 ; 3 ; 5 ; 1 ; 8 ; 9 ; 6 ; 8 ; 12 ; 10 ; 3 ; 6 ; 6 ; 8 ; 10 ; 5 ; 2 ; 8 ; 4 ; 3 ; 9 ; 1 ; 2 ; 4 ; 3 ; 2 ; 5 ; 5 ; 4 ; 5 ; 3 ; 1 ; 6 ; 6 ; 7 ; 2 ; 2 ; 5 ; 5 ; 6 ; 5 ; 2 ; 1 ; 1 ; 5 ; 12 ; 5 ; 11 ; 9 ; 3 ; 6 ; 2 ; 1 ; 6 ; 5 ; 7 ; 8 ; 9 ; 4 ; 4 ; 5 ; 2 ; 6 ; 4 ; 8 ; 9 ; 4 ; 0 ; 3 ; 9 ; 7 ; 3 ; 0 ; 2 ; 4 ; 9 ; 2 ; 8 ; 1 ; 4 ; 15 ; 3 ; 5 ; 3 ; 7 ; 4 ; 8 ; 9 ; 6 ; 10 ; 2 ; 12 ; 14 ; 4 ; 1 ; 7 ; 3 ; 6 ; 10 ; 10 ; 4 ; 8 ; 7 ; 3 ; 12 ; 1 ; 13 ; 12 ; 3 ; 6 ; 3 ; 4 ; 3 ; 2 ; 10 ; 3 ; 3 ; 2 ; 0 ; 5 ; 9 ; 4 ; 2 ; 9 ; 2 ; 0 ; 12 ; 4 ; 18 ; 4 ; 2 ; 4 ; 3 ; 5 ; 5 ; 18 ; 5 ; 4 ; 12 ; 13 ; 6 ; 0 ; 3 ; 3 ; 2 ; 9 ; 11 ; 3 ; 2 ; 4 ; 2 ; 6 ; 3 ; 7 ; 4 ; 12 ; 6 ; 5 ; 4 ; 3 ; 3 ; 7 ; 6 ; 6 ; 3 ; 6 ; 1 ; 7 ; 4 ; 2 ; 3 ; 7 ; 15 ; 6 ; 1 ; 8 ; 9 ; 6 ; 5 ; 10 ; 8 ; 1 ; 9 ; 3 ; 5 ; 5 ; 10 ; 3 ; 1 ; 3 ; 7 ; 7 ; 4 ; 5 ; 0 ; 5 ; 4 ; 6 ; 9 ; 3 ; 9 ; 3 ; 3 ; 13 ; 4 ; 9 ; 8 ; 5 ; 5 ; 2 ; 3 ; 4 ; 4 ; 0 ; 2 ; 6 ; 2 ; 2 ; 2 ; 3 ; 5 ; 2 ; 2 ; 5 ; 3 ; 4 ; 2 ; 4 ; 9 ; 0 ; 6 ; 7 ; 7 ; 8 ; 6 ; 6 ; 1 ; 8 ; 3 ; 5 ; 2 ; 4 ; 3 ; 8 ; 7 ; 6 ; 9 ; 1 ; 17 ; 2 ; 6 ; 5 ; 7 ; 10 ; 5 ; 3 ; 2 ; 11 ; 5 ; 7 ; 7 ; 2 ; 5 ; 4 ; 2 ; 8 ; 4 ; 2 ; 8 ; 0 ; 6 ; 1 ; 1 ; 8 ; 5 ; 8 ; 6 ; 5 ; 4 ; 3 ; 6 ; 6 ; 13 ; 4 ; 2 ; 4 ; 7 ; 7 ; 0 ; 2 ; 20 ; 8 ; 5 ; 9 ; 2 ; 3 ; 3 ; 6 ; 2 ; 5 ; 2 ; 6 ; 0 ; 5 ; 7 ; 4 ; 4 ; 4 ; 6 ; 6 ; 3 ; 5 ; 5 ; 7 ; 1 ; 5 ; 7 ; 6 ; 2 ; 4 ; 4 ; 0 ; 1 ; 2 ; 4 ; 4 ; 2 ; 4 ; 4 ; 3 ; 7 ; 4 ; 5 ; 10 ; 2 ; 2 ; 1 ; 2 ; 2 ; 0 ; 1 ; 9 ; 3 ; 4 ; 6 ; 0 ; 3 ; 5 ; 4 ; 5 ; 2 ; 6 ; 1 ; 2 ; 1 ; 4 ; 4 ; 7 ; 9 ; 3 ; 10 ; 1 ; 2 ; 3 ; 3 ; 0 ; 3 ; 3 ; 6 ; 9 ; 4 ; 1 ; 0 ; 1 ; 6 ; 2 ; 11 ; 11 ; 2 ; 10 ; 5 ; 9 ; 8 ; 5 ; 3 ; 1 ; 3 ; 7 ; 6 ; 8 ; 4 ; 2 ; 2 ; 5 ; 1 ; 2 ; 3 ; 4 ; 1 ; 1 ; 5 ; 8 ; 6 ; 1 ; 6 ; 0 ; 4 ; 4 ; 4 ; 7 ; 10 ; 4 ; 5 ; 2 ; 9 ; 4 ; 5 ; 6 ; 8 ; 6 ; 5 ; 4 ; 6 ; 1 ; 5 ; 6 ; 2 ; 1 ; 11 ; 8 ; 4 ; 2 ; 8 ; 7 ; 11 ; 3 ; 15 ; 22 ; 8 ; 2 ; 9 ; 6 ; 4 ; 7 ; 3 ; 6 ; 4 ; 1 ; 8 ; 7 ; 9 ; 6 ; 11 ; 1 ; 10 ; 8 ; 2 ; 2 ; 2 ; 7 ; 8 ; 2 
; 3 ; 8 ; 8 ; 2 ; 6 ; 0 ; 13 ; 2 ; 2 ; 4 ; 9 ; 2 ; 7 ; 6 ; 1 ; 9 ; 7 ; 1 ; 5 ; 8 ; 6 ; 2 ; 11 ; 11 ; 8 ; 3 ; 4 ; 3 ; 0 ; 7 ; 1 ; 6 ; 13 ; 4 ; 4 ; 5 ; 5 ; 2 ; 4 ; 14 ; 5 ; 3 ; 2 ; 4 ; 4 ; 8 ; 2 ; 1 ; 2 ; 6 ; 4 ; 7 ; 6 ; 2 ; 2 ; 2 ; 7 ; 10 ; 5 ; 9 ; 3 ; 3 ; 3 ; 2 ; 3 ; 2 ; 9 ; 6 ; 0 ; 3 ; 5 ; 5 ; 9 ; 5 ; 10 ; 7 ; 3 ; 6 ; 4 ; 2 ; 2 ; 5 ; 5 ; 1 ; 1 ; 4 ; 0 ; 0 ; 5 ; 4 ; 4 ; 5 ; 7 ; 1 ; 4 ; 6 ; 6 ; 3 ; 5 ; 6 ; 7 ; 2 ; 4 ; 14 ; 14 ; 6 ; 4 ; 0 ; 6 ; 11 ; 8 ; 5 ; 16 ; 14 ; 0 ; 3 ; 4 ; 1 ; 4 ; 4 ; 0 ; 1 ; 5 ; 7 ; 10 ; 2 ; 6 ; 9 ; 1 ; 15 ; 2 ; 4 ; 1 ; 10 ; 5 ; 8 ; 4 ; 2 ; 6 ; 4 ; 3 ; 4 ; 7 ; 4 ; 6 ; 0 ; 11 ; 13 ; 6 ; 0 ; 14 ; 10 ; 4 ; 2 ; 5 ; 1 ; 1 ; 2 ; 7 ; 3 ; 8 ; 3 ; 2 ; 0 ; 12 ; 4 ; 3 ; 1 ; 6 ; 11 ; 7 ; 10 ; 10 ; 1 ; 1 ; 11 ; 4 ; 3 ; 1 ; 5 ; 4 ; 7 ; 5 ; 8 ; 1 ; 6 ; 7 ; 7 ; 1 ; 1 ; 2 ; 10 ; 0 ; 4 ; 7 ; 0 ; 5 ; 2 ; 2 ; 1 ; 2 ; 2 ; 3 ; 3 ; 7 ; 1 ; 2 ; 3 ; 3 ; 8 ; 3 ; 3 ; 3 ; 2 ; 14 ; 1 ; 3 ; 6 ; 4 ; 4 ; 1 ; 5 ; 3 ; 2 ; 2 ; 1 ; 2 ; 1 ; 4 ; 1 ; 8 ; 3 ; 0 ; 5 ; 7 ; 3 ; 4 ; 1 ; 3 ; 3 ; 2 ; 3 ; 2 ; 8 ; 7 ; 2 ; 6 ; 0 ; 13 ; 7 ; 6 ; 6 ; 7 ; 10 ; 3 ; 0 ; 1 ; 3 ; 3 ; 9 ; 4 ; 3 ; 12 ; 5 ; 4 ; 1 ; 6 ; 3 ; 9 ; 1 ; 11 ; 3 ; 10 ; 3 ; 2 ; 4 ; 0 ; 5 ; 3 ; 3 ; 10 ; 3 ; 5 ; 6 ; 7 ; 0 ; 4 ; 11 ; 11 ; 8 ; 4 ; 0 ; 2 ; 0 ; 7 ; 5 ; 0 ; 15 ; 7 ; 1 ; 6 ; 5 ; 3 ; 4 ; 3 ; 1 ; 7 ; 2 ; 12 ; 0 ; 4 ; 8 ; 9 ; 12 ; 13 ; 3 ; 2 ; 7 ; 3 ; 3 ; 15 ; 12 ; 5 ; 6 ; 1 ; 6 ; 4 ; 1 ; 4 ; 4 ; 2 ; 1 ; 6 ; 3 ; 8 ; 7 ; 1 ; 0 ; 8 ; 5 ; 6 ; 9 ; 3 ; 0 ; 5 ; 5 ; 8 ; 6 ; 2 ; 3 ; 4 ; 2 ; 6 ; 5 ; 0 ; 1 ; 0 ; 2 ; 3 ; 1 ; 7 ; 3 ; 2 ; 5 ; 0 ; 2 ; 8 ; 8 ; 2 ; 11 ; 14 ; 3 ; 4 ; 0 ; 4 ; 0 ; 3 ; 2 ; 2 ; 2 ; 4 ; 4 ; 3 ; 3 ; 5 ; 4 ; 11 ; 2 ; 8 ; 0 ; 3 ; 3 ; 9 ; 1 ; 4 ; 6 ; 4 ; 1 ; 8 ; 0 ; 1 ; 2 ; 5 ; 2 ; 3 ; 8 ; 2 ; 6 ; 6 ; 8 ; 4 ; 1 ; 5 ; 1 ; 7 ; 4 ; 7 ; 0 ; 1 ; 6 ; 5 ; 2 ; 2 ; 2 ; 1 ; 4 ; 0 ; 2 ; 1 ; 4 ; 5 ; 4 ; 0 ; 3 ; 0 ; 2 ; 5 ; 2 ; 5 ; 7 ; 4 ; 13 ; 4 ; 1 ; 5 ; 3 ; 4 ; 2 ; 12 ; 0 ; 0 ; 1 ; 3 ; 3 ; 2 ; 4 ; 3 ; 2 ; 1 ; 7 ; 6 ; 7 ; 4 ; 6 ; 5 ; 4 ; 3 ; 6 ; 3 ; 4 ; 4 ; 10 ; 2 ; 0 ; 5 ; 10 ; 1 ; 7 ; 4 ; 5 ; 10 ; 9 ; 1 ; 2 ; 0 ; 1"
oppScore <- "7 ; 6 ; 3 ; 3 ; 6 ; 7 ; 8 ; 6 ; 7 ; 3 ; 3 ; 6 ; 7 ; 8 ; 1 ; 1 ; 6 ; 10 ; 14 ; 3 ; 6 ; 1 ; 3 ; 2 ; 2 ; 4 ; 12 ; 0 ; 1 ; 4 ; 4 ; 9 ; 2 ; 2 ; 5 ; 3 ; 2 ; 5 ; 6 ; 4 ; 0 ; 5 ; 9 ; 2 ; 3 ; 6 ; 7 ; 8 ; 2 ; 4 ; 6 ; 4 ; 3 ; 6 ; 1 ; 9 ; 2 ; 0 ; 5 ; 7 ; 16 ; 4 ; 0 ; 3 ; 3 ; 4 ; 3 ; 3 ; 7 ; 5 ; 14 ; 5 ; 11 ; 4 ; 0 ; 4 ; 9 ; 1 ; 6 ; 6 ; 4 ; 6 ; 9 ; 0 ; 16 ; 3 ; 7 ; 1 ; 1 ; 1 ; 5 ; 2 ; 2 ; 5 ; 3 ; 5 ; 11 ; 9 ; 0 ; 5 ; 1 ; 7 ; 0 ; 3 ; 5 ; 9 ; 1 ; 7 ; 10 ; 7 ; 6 ; 8 ; 3 ; 4 ; 4 ; 3 ; 5 ; 5 ; 5 ; 9 ; 2 ; 4 ; 4 ; 3 ; 0 ; 6 ; 4 ; 1 ; 8 ; 3 ; 9 ; 5 ; 4 ; 9 ; 2 ; 3 ; 7 ; 3 ; 4 ; 2 ; 1 ; 6 ; 4 ; 15 ; 3 ; 0 ; 5 ; 5 ; 4 ; 10 ; 7 ; 11 ; 6 ; 5 ; 9 ; 3 ; 4 ; 8 ; 6 ; 9 ; 7 ; 6 ; 2 ; 5 ; 4 ; 1 ; 18 ; 6 ; 6 ; 15 ; 6 ; 5 ; 4 ; 6 ; 8 ; 9 ; 8 ; 5 ; 1 ; 1 ; 1 ; 0 ; 7 ; 2 ; 4 ; 4 ; 3 ; 4 ; 7 ; 8 ; 2 ; 0 ; 7 ; 4 ; 4 ; 5 ; 5 ; 5 ; 4 ; 4 ; 4 ; 9 ; 1 ; 4 ; 10 ; 6 ; 5 ; 7 ; 1 ; 7 ; 1 ; 1 ; 6 ; 7 ; 15 ; 3 ; 1 ; 7 ; 5 ; 6 ; 6 ; 4 ; 2 ; 6 ; 10 ; 5 ; 7 ; 14 ; 3 ; 10 ; 6 ; 1 ; 7 ; 9 ; 4 ; 3 ; 5 ; 2 ; 4 ; 9 ; 1 ; 5 ; 4 ; 1 ; 8 ; 3 ; 1 ; 0 ; 2 ; 3 ; 4 ; 5 ; 6 ; 13 ; 5 ; 2 ; 3 ; 4 ; 1 ; 5 ; 6 ; 15 ; 5 ; 0 ; 3 ; 12 ; 12 ; 1 ; 4 ; 5 ; 3 ; 3 ; 5 ; 3 ; 3 ; 9 ; 2 ; 5 ; 1 ; 1 ; 6 ; 4 ; 3 ; 4 ; 5 ; 6 ; 4 ; 1 ; 1 ; 2 ; 2 ; 5 ; 7 ; 3 ; 2 ; 5 ; 2 ; 2 ; 4 ; 7 ; 7 ; 2 ; 0 ; 10 ; 1 ; 9 ; 2 ; 3 ; 3 ; 3 ; 2 ; 6 ; 3 ; 1 ; 4 ; 4 ; 1 ; 2 ; 3 ; 5 ; 1 ; 3 ; 4 ; 2 ; 6 ; 7 ; 4 ; 1 ; 10 ; 2 ; 8 ; 2 ; 5 ; 7 ; 9 ; 1 ; 14 ; 4 ; 2 ; 6 ; 1 ; 3 ; 3 ; 3 ; 3 ; 1 ; 6 ; 6 ; 7 ; 6 ; 7 ; 0 ; 3 ; 2 ; 1 ; 0 ; 2 ; 3 ; 3 ; 3 ; 0 ; 1 ; 1 ; 2 ; 2 ; 16 ; 2 ; 7 ; 7 ; 4 ; 8 ; 4 ; 3 ; 4 ; 1 ; 0 ; 14 ; 4 ; 6 ; 3 ; 6 ; 8 ; 4 ; 5 ; 7 ; 8 ; 4 ; 4 ; 8 ; 2 ; 5 ; 7 ; 8 ; 8 ; 3 ; 3 ; 4 ; 4 ; 10 ; 6 ; 3 ; 1 ; 3 ; 4 ; 9 ; 4 ; 7 ; 10 ; 2 ; 4 ; 8 ; 1 ; 5 ; 11 ; 1 ; 3 ; 4 ; 8 ; 5 ; 0 ; 5 ; 3 ; 2 ; 3 ; 12 ; 3 ; 1 ; 0 ; 3 ; 2 ; 8 ; 2 ; 2 ; 4 ; 13 ; 5 ; 13 ; 4 ; 4 ; 6 ; 1 ; 8 ; 12 ; 1 ; 4 ; 3 ; 3 ; 5 ; 3 ; 8 ; 1 ; 5 ; 1 ; 10 ; 9 ; 9 ; 2 ; 5 ; 7 ; 4 ; 1 ; 10 ; 10 ; 8 ; 1 ; 4 ; 1 ; 2 ; 6 ; 8 ; 3 ; 3 ; 13 ; 5 ; 2 ; 2 ; 4 ; 5 ; 4 ; 7 ; 5 ; 6 ; 4 ; 0 ; 0 ; 0 ; 4 ; 
3 ; 6 ; 5 ; 3 ; 8 ; 12 ; 1 ; 0 ; 5 ; 12 ; 3 ; 2 ; 7 ; 5 ; 11 ; 6 ; 5 ; 0 ; 3 ; 1 ; 0 ; 3 ; 0 ; 2 ; 4 ; 1 ; 2 ; 11 ; 8 ; 2 ; 8 ; 11 ; 5 ; 2 ; 1 ; 6 ; 3 ; 6 ; 3 ; 9 ; 2 ; 1 ; 5 ; 6 ; 6 ; 1 ; 5 ; 4 ; 3 ; 2 ; 3 ; 3 ; 5 ; 7 ; 1 ; 5 ; 1 ; 6 ; 10 ; 1 ; 7 ; 1 ; 3 ; 5 ; 5 ; 6 ; 4 ; 5 ; 4 ; 3 ; 1 ; 6 ; 1 ; 8 ; 3 ; 4 ; 1 ; 5 ; 5 ; 8 ; 9 ; 12 ; 5 ; 3 ; 8 ; 1 ; 5 ; 3 ; 2 ; 3 ; 0 ; 0 ; 4 ; 5 ; 2 ; 7 ; 9 ; 4 ; 0 ; 5 ; 3 ; 2 ; 2 ; 1 ; 3 ; 3 ; 3 ; 4 ; 6 ; 3 ; 1 ; 4 ; 1 ; 4 ; 0 ; 2 ; 3 ; 6 ; 2 ; 5 ; 2 ; 2 ; 5 ; 4 ; 1 ; 9 ; 5 ; 0 ; 10 ; 6 ; 0 ; 3 ; 2 ; 3 ; 6 ; 3 ; 5 ; 4 ; 5 ; 5 ; 3 ; 1 ; 9 ; 1 ; 4 ; 5 ; 2 ; 0 ; 1 ; 0 ; 11 ; 7 ; 7 ; 6 ; 9 ; 9 ; 6 ; 4 ; 6 ; 7 ; 4 ; 3 ; 10 ; 13 ; 2 ; 7 ; 3 ; 5 ; 2 ; 6 ; 3 ; 3 ; 0 ; 4 ; 5 ; 6 ; 4 ; 2 ; 3 ; 6 ; 5 ; 1 ; 6 ; 4 ; 5 ; 2 ; 4 ; 5 ; 4 ; 1 ; 2 ; 3 ; 3 ; 4 ; 1 ; 0 ; 3 ; 1 ; 6 ; 3 ; 3 ; 1 ; 5 ; 3 ; 9 ; 1 ; 5 ; 10 ; 6 ; 2 ; 5 ; 2 ; 5 ; 1 ; 7 ; 5 ; 3 ; 6 ; 1 ; 4 ; 6 ; 10 ; 5 ; 20 ; 7 ; 6 ; 4 ; 3 ; 2 ; 5 ; 2 ; 5 ; 2 ; 5 ; 13 ; 7 ; 9 ; 4 ; 6 ; 10 ; 4 ; 14 ; 2 ; 4 ; 4 ; 4 ; 5 ; 0 ; 2 ; 3 ; 3 ; 9 ; 1 ; 1 ; 7 ; 5 ; 4 ; 5 ; 2 ; 3 ; 2 ; 5 ; 1 ; 6 ; 3 ; 2 ; 4 ; 2 ; 4 ; 1 ; 0 ; 8 ; 8 ; 1 ; 5 ; 2 ; 4 ; 6 ; 4 ; 6 ; 10 ; 7 ; 2 ; 9 ; 3 ; 11 ; 8 ; 4 ; 7 ; 2 ; 3 ; 3 ; 6 ; 3 ; 0 ; 2 ; 10 ; 5 ; 5 ; 9 ; 3 ; 4 ; 2 ; 4 ; 2 ; 0 ; 3 ; 1 ; 2 ; 2 ; 1 ; 8 ; 8 ; 9 ; 4 ; 0 ; 3 ; 4 ; 8 ; 5 ; 3 ; 6 ; 7 ; 2 ; 2 ; 3 ; 4 ; 2 ; 7 ; 3 ; 2 ; 4 ; 1 ; 1 ; 6 ; 7 ; 8 ; 3 ; 2 ; 8 ; 4 ; 4 ; 1 ; 6 ; 8 ; 6 ; 3 ; 3 ; 5 ; 7 ; 6 ; 8 ; 3 ; 4 ; 0 ; 6 ; 4 ; 4 ; 2 ; 6 ; 12 ; 8 ; 4 ; 6 ; 1 ; 5 ; 3 ; 3 ; 0 ; 1 ; 7 ; 6 ; 8 ; 6 ; 3 ; 2 ; 3 ; 1 ; 5 ; 2 ; 5 ; 1 ; 2 ; 4 ; 3 ; 7 ; 5 ; 4 ; 7 ; 0 ; 4 ; 4 ; 5 ; 5 ; 3 ; 2 ; 7 ; 4 ; 4 ; 9 ; 3 ; 3 ; 7 ; 2 ; 0 ; 4 ; 2 ; 5 ; 4 ; 14 ; 0 ; 7 ; 2 ; 5 ; 5 ; 18 ; 7 ; 7 ; 3 ; 4 ; 9 ; 0 ; 1 ; 2 ; 2 ; 5 ; 7 ; 2 ; 6 ; 1 ; 7 ; 8 ; 4 ; 1 ; 5 ; 6 ; 5 ; 2 ; 3 ; 7 ; 2 ; 3 ; 4 ; 3 ; 1 ; 3 ; 11 ; 6 ; 1 ; 3 ; 11 ; 7 ; 8 ; 4 ; 5 ; 13 ; 1 ; 2 ; 2 ; 7 ; 2 ; 5 ; 14 ; 4 ; 5 ; 0 ; 4 ; 1 ; 5 ; 8 ; 4 ; 0 ; 2 ; 6 ; 1 ; 6 ; 6 ; 3 ; 1 ; 4 ; 5 ; 4 ; 8 ; 2 ; 3"
# Home-game indicator per game (1 = home, 0 = away), same " ; "-separated
# encoding; the games appear home-block first, so this is a run of 1s
# followed by a run of 0s.
rsHome <- "1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 
0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0"
# Season (year) label per game, FIRST half of the schedule; combined with
# rsSeas2 below (via paste) before parsing into the redsox data frame.
rsSeas1 <- "2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 
2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015"
# Season (year) label per game, SECOND half of the schedule; appended to
# rsSeas1 below so the combined vector has one entry per game.
rsSeas2 <- "2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2010 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2011 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2012 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 
2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2013 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2014 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015 ; 2015"
# Rebuild the full per-game season vector: the labels were stored in two
# halves, so join them into one " ; "-delimited string before parsing.
rsSeas <- paste(rsSeas1, rsSeas2, sep=" ; ")
# Helper: split a " ; "-delimited string and convert each token with `convert`
parse_field <- function(txt, convert) {
  convert(strsplit(txt, " ; ")[[1]])
}
# Assemble one row per game. rsDate and rsScore are defined earlier in the
# file; mlb/nfl/nhl/nba are constant sport flags (this is Red Sox data only).
redsox <- data.frame(
  date = parse_field(rsDate, identity),
  boston_score = parse_field(rsScore, as.integer),
  opponent_score = parse_field(oppScore, as.integer),
  homegame = parse_field(rsHome, as.numeric),
  mlb = 1, nfl = 0, nhl = 0, nba = 0,
  season = parse_field(rsSeas, as.numeric),
  stringsAsFactors = FALSE
)
# View summary information about your redsox data
summary(redsox)
## date boston_score opponent_score homegame
## Length:972 Min. : 0.000 Min. : 0.000 Min. :0.0
## Class :character 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:0.0
## Mode :character Median : 4.000 Median : 4.000 Median :0.5
## Mean : 4.796 Mean : 4.538 Mean :0.5
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:1.0
## Max. :22.000 Max. :20.000 Max. :1.0
## mlb nfl nhl nba season
## Min. :1 Min. :0 Min. :0 Min. :0 Min. :2010
## 1st Qu.:1 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:2011
## Median :1 Median :0 Median :0 Median :0 Median :2012
## Mean :1 Mean :0 Mean :0 Mean :0 Mean :2012
## 3rd Qu.:1 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:2014
## Max. :1 Max. :0 Max. :0 Max. :0 Max. :2015
# Convert the date column to a time-based format
redsox$date<- as.Date(redsox$date)
# Convert your red sox data to xts
# (as.xts() is from the xts package; redsox[,-1] drops the character date
# column, which instead becomes the time index via order.by)
redsox_xts <- as.xts(redsox[,-1], order.by = redsox$date)
# Plot the Red Sox score and the opponent score over time
# (plot.zoo draws one panel per selected column)
plot.zoo(redsox_xts[, c("boston_score", "opponent_score")])
# Generate a new variable coding for red sox wins
# (1 = Boston outscored the opponent, 0 otherwise)
redsox_xts$win_loss <- ifelse(redsox_xts$boston_score > redsox_xts$opponent_score, 1, 0)
# Identify the date of the last game each season
# (xts::endpoints returns the row index of the final observation per year)
close <- endpoints(redsox_xts, on = "years")
# Calculate average win/loss record at the end of each season
# (mean of the 0/1 win_loss column within each year = season win percentage)
period.apply(redsox_xts[, "win_loss"], INDEX=close, FUN=mean)
## win_loss
## 2010-10-03 0.5493827
## 2011-09-28 0.5555556
## 2012-10-03 0.4259259
## 2013-09-29 0.5987654
## 2014-09-28 0.4382716
## 2015-10-04 0.4814815
# Split redsox_xts win_loss data into years
# (split.xts returns a list with one object per calendar year)
redsox_seasons <- split(redsox_xts$win_loss, f = "years")
# Use lapply to calculate the cumulative mean for each season
# (NOTE(review): cummean() is from dplyr, not base R — assumes dplyr is
# loaded earlier in the document; confirm)
redsox_ytd <- lapply(redsox_seasons, cummean)
# Use do.call to rbind the results
redsox_winloss <- do.call(rbind, redsox_ytd)
# Plot the win_loss average for the 2013 season
# (t() + as.vector() flattens the combined result back to one value per game
# in chronological order, then re-indexes by the original game dates;
# presumably this relies on every season having the same number of games —
# verify before reusing with uneven seasons)
plot.xts( as.xts(as.vector(t(redsox_winloss)), order.by=index(redsox_xts))["2013"], ylim = c(0, 1))
# Select only the 2013 season
redsox_2013 <- redsox_xts["2013"]
# Use rollapply to generate the last ten average
# (rolling mean over a 10-game window; the leading window positions are NA)
lastten_2013 <- rollapply(redsox_2013$win_loss, width = 10, FUN = mean)
# Plot the last ten average during the 2013 season
plot.xts(lastten_2013, ylim = c(0, 1))
### *** dataset "sports" is not available; need to comment out
# Extract the day of the week of each observation
# weekday <- .indexwday(sports)
# head(weekday)
# Generate an index of weekend dates
# weekend <- which(.indexwday(sports) == 0 | .indexwday(sports) == 6)
# Subset only weekend games
# weekend_games <- sports[weekend]
# head(weekend_games)
# Generate a subset of sports data with only homegames
# homegames <- sports[sports$homegame == 1]
# Calculate the win/loss average of the last 20 home games
# homegames$win_loss_20 <- rollapply(homegames$win_loss, width = 20, FUN = mean)
# Calculate the win/loss average of the last 100 home games
# homegames$win_loss_100 <- rollapply(homegames$win_loss, width = 100, FUN = mean)
# Use plot.xts to generate
# plot.zoo(homegames[, c("win_loss_20", "win_loss_100")], plot.type = "single", lty = lty, lwd = lwd)
Chapter 1 - Exploring pitch velocities
Zack Greinke 2015 season - the dominant month of July 2015:
Subsets and histograms - the start_speed is the MPH numeric for the pitch when it leaves the pitcher’s hand:
Using tapply() for comparisons:
Example code includes:
# Problem - I do not have the greinke dataset!
# Print the first 6 rows of the data
# head(greinke)
# Print the number of rows in the data frame
# nrow(greinke)
# Summarize the start_speed variable
# summary(greinke$start_speed)
# Get rid of data without start_speed
# greinke <- subset(greinke, !is.na(start_speed))
# Print the number of complete entries
# nrow(greinke)
# Print the structure of greinke
# str(greinke)
# Check if dates are formatted as dates
# class(greinke$game_date)
# Change them to dates
# greinke$game_date <- as.Date(greinke$game_date, format="%m/%d/%Y")
# Check that the variable is now formatted as a date
# class(greinke$game_date)
# Separate game_date into "year", "month", and "day"
# greinke <- separate(data = greinke, col = game_date,
# into = c("year", "month", "day"),
# sep = "-", remove = FALSE)
# Convert month to numeric
# greinke$month <- as.numeric(greinke$month)
# Create the july variable
# greinke$july <- ifelse(greinke$month == 7, "july", "other")
# View the head() of greinke
# head(greinke)
# Print a summary of the july variable
# summary(factor(greinke$july))
# Make a histogram of Greinke's start speed
# hist(greinke$start_speed)
# Create greinke_july
# greinke_july <- subset(greinke, july == "july")
# Create greinke_other
# greinke_other <- subset(greinke, july == "other")
# Use par to format your plot layout
# par(mfrow = c(1, 2))
# Plot start_speed histogram from july
# hist(greinke_july$start_speed)
# Plot start_speed histogram for other months
# hist(greinke_other$start_speed)
# Create july_ff
# july_ff <- subset(greinke_july, pitch_type == "FF")
# Create other_ff
# other_ff <- subset(greinke_other, pitch_type == "FF")
# Formatting code, don't change this
# par(mfrow = c(1, 2))
# Plot histogram of July fastball speeds
# hist(july_ff$start_speed)
# Plot histogram of other month fastball speeds
# hist(other_ff$start_speed)
# Make a fastball speed histogram for other months
# hist(other_ff$start_speed,
# col = "#00009950", freq = FALSE,
# ylim = c(0, .35), xlab = "Velocity (mph)",
# main = "Greinke 4-Seam Fastball Velocity")
# Add a histogram for July
# hist(july_ff$start_speed,
# col = "#99000050", freq = FALSE,
# add=TRUE)
# Draw vertical line at the mean of other_ff
# abline(v=mean(other_ff$start_speed), col="#00009950", lwd=2)
# Draw vertical line at the mean of july_ff
# abline(v=mean(july_ff$start_speed), col="#99000050", lwd=2)
# Summarize velocity in July and other months
# tapply(greinke$start_speed, greinke$july, FUN=mean)
# Create greinke_ff
# greinke_ff <- subset(greinke, pitch_type == "FF")
# Calculate mean fastball velocities: ff_velo_month
# ff_velo_month <- tapply(greinke_ff$start_speed, greinke_ff$july, FUN=mean)
# Print ff_velo_month
# ff_velo_month
# Create ff_dt
# ff_dt <- data.frame(tapply(greinke_ff$start_speed, greinke_ff$game_date, FUN=mean))
# Print the first 6 rows of ff_dt
# head(ff_dt)
# Create game_date in ff_dt
# ff_dt$game_date <- as.Date(row.names(ff_dt), format="%Y-%m-%d")
# Rename the first column
# colnames(ff_dt)[1] <- "start_speed"
# Remove row names
# row.names(ff_dt) <- NULL
# View head of ff_dt
# head(ff_dt)
# Plot game-by-game 4-seam fastballs
# plot(ff_dt$start_speed ~ ff_dt$game_date,
# lwd = 4, type = "l", ylim = c(88, 95),
# main = "Greinke 4-Seam Fastball Velocity",
# xlab = "Date", ylab = "Velocity (mph)"
# )
# Code from last exercise, don't change this
# plot(ff_dt$start_speed ~ ff_dt$game_date,
# lwd = 4, type = "l", ylim = c(88, 95),
# main = "Greinke 4-Seam Fastball Velocity",
# xlab = "Date", ylab = "Velocity (mph)")
# Add jittered points to the plot
# points(greinke_ff$start_speed ~ jitter(as.numeric(greinke_ff$game_date)),
# pch=16, col = "#99004450"
# )
# Placeholder output noting the 'greinke' dataset is unavailable here
cat("\n\nCould not run - do not have dataset 'greinke' or anything that would serve as an analog\n\n")
##
##
## Could not run - do not have dataset 'greinke' or anything that would serve as an analog
Chapter 2 - Exploring pitch types
Pitch mix - did the pitch mix change in July:
Ball-strike count and pitch usage:
Example code includes:
# DO NOT HAVE THE FULL DATA
# Subset the data to remove pitch types "IN" and "EP"
# greinke <- subset(greinke, pitch_type != "IN" & pitch_type != "EP")
# Drop the levels from pitch_type
# droplevels(greinke$pitch_type)
# Create type_tab
# type_tab <- table(greinke$pitch_type, greinke$july)
# Print type_tab
# type_tab
# Create type_tab
# Rebuild the pitch-type-by-month contingency table from its cell counts
# (first five entries are the July counts, last five the other months)
myFreq <- c(112, 51, 207, 66, 86, 487, 242, 1191, 255, 535)
pitch_codes <- c("CH", "CU", "FF", "FT", "SL")
# Expand the counts back into one label per pitch, then cross-tabulate
myType <- rep(rep(pitch_codes, times = 2), times = myFreq)
myJuly <- rep(rep(c("july", "other"), each = length(pitch_codes)), times = myFreq)
type_tab <- table(myType, myJuly)
type_tab
## myJuly
## myType july other
## CH 112 487
## CU 51 242
## FF 207 1191
## FT 66 255
## SL 86 535
# Create type_prop table: share of each pitch type within July vs. other
# months. sweep(x, 2, colSums(x), "/") is exactly what prop.table(x, 2) does.
type_prop <- round(sweep(type_tab, 2, colSums(type_tab), "/"), 3)
# Print type_prop
type_prop
## myJuly
## myType july other
## CH 0.215 0.180
## CU 0.098 0.089
## FF 0.397 0.439
## FT 0.126 0.094
## SL 0.165 0.197
# Create type_prop table
# (NOTE: this repeats the computation above verbatim, as in the original
# DataCamp exercise transcript; the result is unchanged)
type_prop <- round(prop.table(type_tab, margin=2), 3)
# Print type_prop
type_prop
## myJuly
## myType july other
## CH 0.215 0.180
## CU 0.098 0.089
## FF 0.397 0.439
## FT 0.126 0.094
## SL 0.165 0.197
# Create ff_prop: pull the four-seam fastball row of the proportion table
# (indexing by row name is equivalent to matching row.names with ==)
ff_prop <- type_prop["FF", ]
# Print ff_prop
ff_prop
## july other
## 0.397 0.439
# Print ff_velo_month
# (Recreated by hand: mean 4-seam fastball velocity, July vs. other months)
ff_velo_month <- data.frame(start_speed = c(92.4, 91.7))
rownames(ff_velo_month) <- c("july", "other")
ff_velo_month
## start_speed
## july 92.4
## other 91.7
# Change up the type_prop data
# Reshape the 5x2 proportion table into a plain data frame with one row per
# pitch type. names(tProp[,1]) recovers the row names (pitch codes).
# NOTE(review): tProp[,1] / tProp[,2] assume column 1 is "july" and column 2
# is "other" (the alphabetical order table() produced) — verify if the month
# labels ever change.
tProp <- type_prop
type_prop <- data.frame(Pitch=names(tProp[,1]), July=tProp[,1], Other=tProp[,2], row.names=NULL)
type_prop
## Pitch July Other
## 1 CH 0.215 0.180
## 2 CU 0.098 0.089
## 3 FF 0.397 0.439
## 4 FT 0.126 0.094
## 5 SL 0.165 0.197
# Create the Difference column: relative change in usage during July,
# expressed as (July - Other) / Other
type_prop$Difference <- with(type_prop, (July - Other) / Other)
# Print type_prop
type_prop
## Pitch July Other Difference
## 1 CH 0.215 0.180 0.19444444
## 2 CU 0.098 0.089 0.10112360
## 3 FF 0.397 0.439 -0.09567198
## 4 FT 0.126 0.094 0.34042553
## 5 SL 0.165 0.197 -0.16243655
# Plot a barplot of the relative usage change, one bar per pitch type
barplot(
  height = type_prop$Difference,
  names.arg = type_prop$Pitch,
  ylim = c(-0.3, 0.3),
  main = "Pitch Usage in July vs. Other Months",
  ylab = "Percentage Change in July"
)
# Create bs_table: ball-strike count frequencies, cross-tabulated as
# balls (rows 0-3) by strikes (columns 0-2). The cell counts are listed
# strike-column by strike-column. The frequency vector was previously
# written out twice verbatim; it is hoisted into one constant so the two
# rep() calls cannot drift apart.
count_freq <- c(845, 307, 84, 19, 435, 371, 171, 50, 201, 310, 300, 139)
# One balls label (0-3) and one strikes label (0-2) per pitch
bsBalls <- rep(rep(0:3, times=3),
times=count_freq
)
bsStrikes <- rep(rep(0:2, each=4),
times=count_freq
)
bs_table <- table(bsBalls, bsStrikes)
bs_table
## bsStrikes
## bsBalls 0 1 2
## 0 845 435 201
## 1 307 371 310
## 2 84 171 300
## 3 19 50 139
# Create bs_table (this would be if the data were available - see above)
# bs_table <- table(greinke$balls, greinke$strikes)
# Create bs_prop_table: each cell as a share of all pitches
# (x / sum(x) is exactly what prop.table(x) computes when no margin is given)
bs_prop_table <- round(bs_table / sum(bs_table), 3)
# Print bs_prop_table
bs_prop_table
## bsStrikes
## bsBalls 0 1 2
## 0 0.261 0.135 0.062
## 1 0.095 0.115 0.096
## 2 0.026 0.053 0.093
## 3 0.006 0.015 0.043
# Print row sums
# (share of pitches thrown at each BALL count; sums of rounded cells, so
# totals can be off by a rounding hair)
rowSums(bs_prop_table)
## 0 1 2 3
## 0.458 0.306 0.172 0.064
# Print column sums
# (share of pitches thrown at each STRIKE count)
colSums(bs_prop_table)
## 0 1 2
## 0.388 0.318 0.294
# DO NOT HAVE THIS DATA
# Create bs_count
# greinke$bs_count <- paste(greinke$balls, greinke$strikes, sep="-")
# Print the first 6 rows of greinke
# head(greinke)
# Create the bs_count_tab data file: pitches per ball-strike count, split by
# July vs. other months. The original parsed three long " ; "-delimited
# string literals with strsplit() and spelled out the 12 count labels twice;
# plain vectors are used instead so the labels exist in exactly one place.
count_levels <- c("0-0", "0-1", "0-2", "1-0", "1-1", "1-2",
                  "2-0", "2-1", "2-2", "3-0", "3-1", "3-2")
# Frequencies: first 12 entries are the July counts, last 12 the other months
bsFreq <- as.numeric(c(136, 70, 29, 55, 64, 48, 15, 27, 45, 3, 8, 22,
                       709, 365, 172, 252, 307, 262, 69, 144, 255, 16, 42, 117))
bsCounts <- rep(rep(count_levels, times = 2), times = bsFreq)
bsTypes <- rep(rep(c("july", "other"), each = length(count_levels)), times = bsFreq)
bs_count_tab <- table(bsCounts, bsTypes)
bs_count_tab
## bsTypes
## bsCounts july other
## 0-0 136 709
## 0-1 70 365
## 0-2 29 172
## 1-0 55 252
## 1-1 64 307
## 1-2 48 262
## 2-0 15 69
## 2-1 27 144
## 2-2 45 255
## 3-0 3 16
## 3-1 8 42
## 3-2 22 117
# Create bs_count_tab (if raw data were actually available - see above)
# bs_count_tab <- table(greinke$bs_count, greinke$july)
# Create bs_month: within-month proportions for each ball-strike count
# (sweep by the column totals — identical to prop.table(x, margin = 2))
bs_month <- round(sweep(bs_count_tab, 2, colSums(bs_count_tab), "/"), 3)
# Print bs_month
bs_month
## bsTypes
## bsCounts july other
## 0-0 0.261 0.262
## 0-1 0.134 0.135
## 0-2 0.056 0.063
## 1-0 0.105 0.093
## 1-1 0.123 0.113
## 1-2 0.092 0.097
## 2-0 0.029 0.025
## 2-1 0.052 0.053
## 2-2 0.086 0.094
## 3-0 0.006 0.006
## 3-1 0.015 0.015
## 3-2 0.042 0.043
# Create diff_bs
# Relative change per count computed as (other - july) / other, so POSITIVE
# values mean the count occurred LESS often in July (note: this is the
# opposite sign convention from the pitch-type Difference column earlier).
# Columns are selected by NAME rather than position so the result does not
# silently change if the table's column order ever does.
diff_bs <- round((bs_month[, "other"] - bs_month[, "july"]) / bs_month[, "other"], 3)
# Print diff_bs
diff_bs
## 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0
## 0.004 0.007 0.111 -0.129 -0.088 0.052 -0.160 0.019 0.085 0.000
## 3-1 3-2
## 0.000 0.023
# Create a bar plot of the changes; las = 2 rotates the count labels vertical
barplot(
  height = diff_bs,
  ylim = c(-0.15, 0.15),
  las = 2,
  main = "Ball-Strike Count Rate in July vs. Other Months",
  ylab = "Percentage Change in July"
)
# Create type_bs
# Rebuild pitch-type x ball-strike-count cell counts. The frequencies are
# listed count-by-count (12 counts), five pitch types per count — i.e. the
# printed table below read column by column.
typeFreq <- as.numeric(strsplit("92 ; 124 ; 482 ; 54 ; 93 ; 93 ; 49 ; 167 ; 55 ; 71 ; 36 ; 10 ; 61 ; 19 ; 75 ; 70 ; 34 ; 136 ; 32 ; 35 ; 79 ; 38 ; 136 ; 50 ; 68 ; 62 ; 9 ; 89 ; 31 ; 119 ; 27 ; 4 ; 37 ; 11 ; 5 ; 46 ; 12 ; 71 ; 18 ; 24 ; 52 ; 9 ; 109 ; 34 ; 96 ; 0 ; 0 ; 17 ; 2 ; 0 ; 18 ; 0 ; 24 ; 3 ; 5 ; 24 ; 4 ; 69 ; 12 ; 30", " ; ")[[1]])
# NOTE(review): the next two lines rely on row.names(bs_count_tab) and
# row.names(type_tab) keeping their original (alphabetical) order — verify
# if those tables are ever rebuilt differently.
typeCount <- rep(rep(row.names(bs_count_tab), each=5), times=typeFreq)
typePitch <- rep(rep(row.names(type_tab), times=12), times=typeFreq)
type_bs <- table(typePitch, typeCount)
type_bs
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0 3-1 3-2
## CH 92 93 36 70 79 62 27 46 52 0 18 24
## CU 124 49 10 34 38 9 4 12 9 0 0 4
## FF 482 167 61 136 136 89 37 71 109 17 24 69
## FT 54 55 19 32 50 31 11 18 34 2 3 12
## SL 93 71 75 35 68 119 5 24 96 0 5 30
# Create type_bs (if greinke data were available; see above)
# type_bs <- table(greinke$pitch_type, greinke$bs_count)
# Print type_bs
type_bs
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0 3-1 3-2
## CH 92 93 36 70 79 62 27 46 52 0 18 24
## CU 124 49 10 34 38 9 4 12 9 0 0 4
## FF 482 167 61 136 136 89 37 71 109 17 24 69
## FT 54 55 19 32 50 31 11 18 34 2 3 12
## SL 93 71 75 35 68 119 5 24 96 0 5 30
# Create type_bs_prop: pitch-type mix within each ball-strike count
# (column-wise proportions via sweep, same as prop.table(x, margin = 2))
type_bs_prop <- round(sweep(type_bs, 2, colSums(type_bs), "/"), 3)
# Print type_bs_prop
type_bs_prop
## typeCount
## typePitch 0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0
## CH 0.109 0.214 0.179 0.228 0.213 0.200 0.321 0.269 0.173 0.000
## CU 0.147 0.113 0.050 0.111 0.102 0.029 0.048 0.070 0.030 0.000
## FF 0.570 0.384 0.303 0.443 0.367 0.287 0.440 0.415 0.363 0.895
## FT 0.064 0.126 0.095 0.104 0.135 0.100 0.131 0.105 0.113 0.105
## SL 0.110 0.163 0.373 0.114 0.183 0.384 0.060 0.140 0.320 0.000
## typeCount
## typePitch 3-1 3-2
## CH 0.360 0.173
## CU 0.000 0.029
## FF 0.480 0.496
## FT 0.060 0.086
## SL 0.100 0.216
# Create type_late: early-game (0) vs late-game (1) pitch counts per pitch
# type. The ten cell frequencies were previously written out twice verbatim;
# they are hoisted into one constant shared by both rep() calls.
late_freq <- c(416, 201, 1036, 249, 431, 183, 92, 362, 72, 190)
lateData <- rep(rep(0:1, each=5),
times=late_freq
)
# NOTE(review): relies on row.names(type_tab) keeping its alphabetical
# order (CH, CU, FF, FT, SL) — verify if type_tab is rebuilt differently.
pitchData <- rep(rep(row.names(type_tab), times=2),
times=late_freq
)
type_late <- table(pitchData, lateData)
type_late
## lateData
## pitchData 0 1
## CH 416 183
## CU 201 92
## FF 1036 362
## FT 249 72
## SL 431 190
# Create the late_in_game column (if had the greinke data; see above)
# greinke$late_in_game <- ifelse(greinke$inning > 5, 1, 0)
# Convert late_in_game (if had the greinke data; see above)
# greinke$late_in_game <- factor(greinke$late_in_game)
# Create type_late (if had the greinke data; see above)
# type_late <- table(greinke$pitch_type, greinke$late_in_game)
# Create type_late_prop: pitch mix within early (0) and late (1) innings
# (column-wise proportions via sweep, same as prop.table(x, margin = 2))
type_late_prop <- round(sweep(type_late, 2, colSums(type_late), "/"), 3)
# Print type_late_prop
type_late_prop
## lateData
## pitchData 0 1
## CH 0.178 0.204
## CU 0.086 0.102
## FF 0.444 0.403
## FT 0.107 0.080
## SL 0.185 0.211
# Create t_type_late
# Transpose so game stage indexes the rows: barplot() with beside = TRUE
# draws one group of bars per COLUMN, so pitch types must be the columns.
t_type_late <- t(type_late_prop)
# Print dimensions of t_type_late
dim(t_type_late)
## [1] 2 5
# Print dimensions of type_late
dim(type_late_prop)
## [1] 5 2
# Change row names
# (relabel 0/1 as Early/Late so the plot legend reads naturally)
rownames(t_type_late) <- c("Early", "Late")
# Make barplot using t_type_late
barplot(t_type_late, beside = TRUE, col = c("red", "blue"),
main = "Early vs. Late In Game Pitch Selection",
ylab = "Pitch Selection Proportion",
legend = rownames(t_type_late))
Chapter 3 - Exploring pitch locations
Pitch location and Greinke’s July - pitches lower and further from the plate are harder to hit, but pitches repeatedly in the same location are easier to hit:
For loop for plots - execute the code across all the zones:
Example code includes:
# DO NOT HAVE THIS DATA
# Calculate average pitch height in inches in July vs. other months
# tapply(greinke$pz, greinke$july, mean) * 12
# Create greinke_lhb
# greinke_lhb <- subset(greinke, batter_stand == "L")
# Create greinke_rhb
# greinke_rhb <- subset(greinke, batter_stand == "R")
# Compute average px location for LHB
# tapply(greinke_lhb$px, greinke_lhb$july, mean) * 12
# Compute average px location for RHB
# tapply(greinke_rhb$px, greinke_rhb$july, mean) * 12
# Plot location of all pitches
# plot(greinke$pz ~ greinke$px,
# col = factor(greinke$july),
# xlim = c(-3, 3))
# Formatting code, don't change this
# par(mfrow = c(1, 2))
# Plot the pitch locations for July
# plot(pz ~ px, data = greinke_july,
# col = "red", pch = 16,
# xlim = c(-3, 3), ylim = c(-1, 6),
# main = "July")
# Plot the pitch locations for other months
# plot(pz ~ px, data = greinke_other,
# col = "black", pch = 16,
# xlim = c(-3, 3), ylim = c(-1, 6),
# main = "Other months")
# Create greinke_sub
# greinke_sub <- subset(greinke, px > -2 & px < 2 & pz > 0 & pz < 5)
# Plot pitch location window
# plot(x = c(-2, 2), y = c(0, 5), type = "n",
# main = "Greinke Locational Zone Proportions",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# Add the grid lines
# grid(lty = "solid", col = "black")
# Create greinke_table
# greinke_table <- table(greinke_sub$zone)
# Create zone_prop
# zone_prop <- round(prop.table(greinke_table), 3)
# Plot strike zone grid, don't change this
# plot_grid()
# Add text from zone_prop[1]
# text(zone_prop[1], x=-1.5, y=4.5, cex=1.5)
# Plot grid, don't change this
# plot_grid()
# Plot text using for loop
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_prop[i], cex = 1.5)
# }
# Create zone_prop_july
# zone_prop_july <- round(
# table(greinke_sub$zone[greinke_sub$july == "july"]) /
# nrow(subset(greinke_sub, july == "july")), 3)
# Create zone_prop_other
# zone_prop_other <- round(
# table(greinke_sub$zone[greinke_sub$july == "other"]) /
# nrow(subset(greinke_sub, july == "other")), 3)
# Print zone_prop_july
# zone_prop_july
# Print zone_prop_other
# zone_prop_other
# Fix zone_prop_july vector, don't change this
# zone_prop_july2 <- c(zone_prop_july[1:3], 0.00, zone_prop_july[4:19])
# names(zone_prop_july2) <- c(1:20)
# Create zone_prop_diff
# zone_prop_diff <- zone_prop_july2 - zone_prop_other
# Print zone_prop_diff
# zone_prop_diff
# Plot grid, don't change this
# plot_grid()
# Create for loop
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_prop_diff[i, ], cex = 1.5)
# }
# Create greinke_zone_tab
# greinke_zone_tab <- table(greinke_sub$zone, greinke_sub$bs_count)
# Create zone_count_prop
# zone_count_prop <- round(prop.table(greinke_zone_tab, margin=2), 3)
# Print zone_count_prop
# zone_count_prop
# Create zone_count_diff
# zone_count_diff <- zone_count_prop[, 3] - zone_count_prop[, 10]
# Print the table
# zone_count_diff
# Plot grid, don't change this
# plot(x = c(-2, 2), y = c(0, 5), type = "n",
# main = "Greinke Locational Zone (0-2 vs. 3-0 Counts)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# grid(lty = "solid", col = "black")
# Add text to the figure for location differences
# for(i in 1:20) {
# text(mean(greinke_sub$zone_px[greinke_sub$zone == i]),
# mean(greinke_sub$zone_pz[greinke_sub$zone == i]),
# zone_count_diff[i], cex = 1.5) # NOTE(review): zone_count_diff is a vector (difference of two table columns), so use [i], not [i, ]
# }
# Data note: the Greinke pitch-location dataset used by the commented-out
# exercises above is not available locally, so only this notice is printed.
cat("\n\nDo not have the data to run the associated code\n\n")
##
##
## Do not have the data to run the associated code
Chapter 4 - Exploring batted ball outcomes
Batted ball outcomes - contact rates:
Using ggplot2 - reduce the labor to produce certain types of graphics:
Batted ball outcomes - exit velocity:
Example code includes:
# DO NOT HAVE THIS DATA . . .
# Create batter_swing
# no_swing <- c("Ball", "Called Strike", "Ball in Dirt", "Hit By Pitch")
# greinke_ff$batter_swing <- ifelse(greinke_ff$pitch_result %in% no_swing, 0, 1)
# Create swing_ff
# swing_ff <- subset(greinke_ff, batter_swing == 1)
# Create the contact variable
# no_contact <- c("Swinging Strike", "Missed Bunt")
# swing_ff$contact <- ifelse(swing_ff$pitch_result %in% no_contact, 0, 1)
# Create velo_bin: add one line for "Fast"
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed < 90.5, "Slow", NA)
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed >= 90.5 & swing_ff$start_speed < 92.5,
# "Medium", swing_ff$velo_bin)
#
# swing_ff$velo_bin <- ifelse(swing_ff$start_speed >= 92.5,
# "Fast", swing_ff$velo_bin)
# Aggregate contact rate by velocity bin
# tapply(swing_ff$contact, swing_ff$velo_bin, FUN=mean)
#
#
# bin_pitch_speed <- function(start_speed) {
# as.integer(cut(start_speed, quantile(start_speed, probs = 0:3 / 3), include.lowest = TRUE))
# }
#
#
# Create the subsets for each pitch type
# swing_ff <- subset(swings, pitch_type == "FF")
# swing_ch <- subset(swings, pitch_type == "CH")
# swing_cu <- subset(swings, pitch_type == "CU")
# swing_ft <- subset(swings, pitch_type == "FT")
# swing_sl <- subset(swings, pitch_type == "SL")
# Make velo_bin_pitch variable for each subset
# swing_ff$velo_bin <- bin_pitch_speed(swing_ff$start_speed)
# swing_ch$velo_bin <- bin_pitch_speed(swing_ch$start_speed)
# swing_cu$velo_bin <- bin_pitch_speed(swing_cu$start_speed)
# swing_ft$velo_bin <- bin_pitch_speed(swing_ft$start_speed)
# swing_sl$velo_bin <- bin_pitch_speed(swing_sl$start_speed)
# Print quantile levels for each pitch
# thirds <- c(0, 1/3, 2/3, 1)
# quantile(swing_ff$start_speed, probs = thirds)
# quantile(swing_ch$start_speed, probs = thirds)
# quantile(swing_cu$start_speed, probs = thirds)
# quantile(swing_ft$start_speed, probs = thirds)
# quantile(swing_sl$start_speed, probs = thirds)
# Calculate contact rate by velocity for swing_ff
# tapply(swing_ff$contact, swing_ff$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_ft
# tapply(swing_ft$contact, swing_ft$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_ch
# tapply(swing_ch$contact, swing_ch$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_cu
# tapply(swing_cu$contact, swing_cu$velo_bin, FUN=mean)
# Calculate contact rate by velocity for swing_sl
# tapply(swing_sl$contact, swing_sl$velo_bin, FUN=mean)
# Create swings_str2
# swings_str2 <- subset(swings, strikes == 2)
# Print number of observations
# nrow(swings_str2)
# Print a table of pitch use
# table(swings_str2$pitch_type)
# Calculate contact rate by pitch type
# round(tapply(swings_str2$contact, swings_str2$pitch_type, FUN=mean), 3)
# Create subset of swings: swings_rhb
# swings_rhb <- subset(swings, batter_stand == "R")
# Create subset of swings: swings_lhb
# swings_lhb <- subset(swings, batter_stand == "L")
# Create zone_contact_r
# zone_contact_r <- round(tapply(swings_rhb$contact, swings_rhb$zone, FUN=mean), 3)
# Create zone_contact_l
# zone_contact_l <- round(tapply(swings_lhb$contact, swings_lhb$zone, FUN=mean), 3)
# Plot figure grid for RHB
# par(mfrow = c(1, 2))
# plot(x = c(-1, 1), y = c(1, 4), type = "n",
# main = "Contact Rate by Location (RHB)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# abline(v = 0)
# abline(h = 2)
# abline(h = 3)
# Add text for RHB contact rate
# for(i in unique(c(6, 7, 10, 11, 14, 15))) {
# text(mean(swings_rhb$zone_px[swings_rhb$zone == i]),
# mean(swings_rhb$zone_pz[swings_rhb$zone == i]),
# zone_contact_r[rownames(zone_contact_r) == i], cex = 1.5)
# }
# Add LHB plot
# plot(x = c(-1, 1), y = c(1, 4), type = "n",
# main = "Contact Rate by Location (LHB)",
# xlab = "Horizontal Location (ft.; Catcher's View)",
# ylab = "Vertical Location (ft.)")
# abline(v = 0)
# abline(h = 2)
# abline(h = 3)
# Add text for LHB contact rate
# for(i in unique(c(6, 7, 10, 11, 14, 15))) {
# text(mean(swings_lhb$zone_px[swings_lhb$zone == i]),
# mean(swings_lhb$zone_pz[swings_lhb$zone == i]),
# zone_contact_l[rownames(zone_contact_l) == i], cex = 1.5)
# }
# Create vector px
# px <- rep(seq(-1.5, 1.5, by=1), times=5)
# Create vector pz
# pz <- rep(seq(4.5, 0.5, by=-1), each=4)
# Create vector of zone numbers
# zone <- seq(1, 20, by=1)
# Create locgrid
# locgrid <- data.frame(zone=zone, px=px, pz=pz)
# Print locgrid
# locgrid
# The gridExtra package is preloaded in your workspace
# Examine new contact data
# zone_contact_r
# zone_contact_l
# Merge locgrid with zone_contact_r
# locgrid <- merge(locgrid, zone_contact_r, by="zone", all.x=TRUE)
# Merge locgrid with zone_contact_l
# locgrid <- merge(locgrid, zone_contact_l, by="zone", all.x=TRUE)
# Print locgrid to the console
# locgrid
# Make base grid with ggplot()
# plot_base_grid <- ggplot(locgrid, aes(x=px, y=pz))
# Arrange the plots side-by-side
# grid.arrange(plot_base_grid, plot_base_grid, ncol=2)
# Make RHB plot
# plot_titles_rhb <- plot_base_grid +
# ggtitle("RHB Contact Rates") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Make LHB plot
# plot_titles_lhb <- plot_base_grid +
# ggtitle("LHB Contact Rates") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Display both side-by-side
# grid.arrange(plot_titles_rhb, plot_titles_lhb, ncol=2)
# Make RHB plot
# plot_colors_rhb <- plot_titles_rhb +
# geom_tile(aes(fill = contact_rate_r)) +
# scale_fill_gradientn(name = "Contact Rate",
# limits = c(0.5, 1),
# breaks = seq(from = 0.5, to = 1, by = 0.1),
# colors = c(brewer.pal(n = 7, name = "Reds")))
# Make LHB plot
# plot_colors_lhb <- plot_titles_lhb +
# geom_tile(aes(fill = contact_rate_l)) +
# scale_fill_gradientn(name = "Contact Rate",
# limits = c(0.5, 1),
# breaks = seq(from = 0.5, to = 1, by = 0.1),
# colors = c(brewer.pal(n = 7, name = "Reds")))
# Display plots side-by-side
# grid.arrange(plot_colors_rhb, plot_colors_lhb, ncol=2)
# Make RHB plot
# plot_contact_rhb <- plot_colors_rhb +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$contact_rate_r, size = 5)
# Make LHB plot
# plot_contact_lhb <- plot_colors_lhb +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$contact_rate_l, size = 5)
# Plot them side-by-side
# grid.arrange(plot_contact_rhb, plot_contact_lhb, ncol=2)
# Create pcontact
# pcontact <- subset(swings, contact == 1 & !is.na(batted_ball_velocity))
# Create pcontact_r
# pcontact_r <- subset(pcontact, batter_stand == "R")
# Create pcontact_l
# pcontact_l <- subset(pcontact, batter_stand == "L")
# Create exit_speed_r
# exit_speed_r <- data.frame(tapply(pcontact_r$batted_ball_velocity,
# pcontact_r$zone, mean))
# exit_speed_r <- round(exit_speed_r, 1)
# colnames(exit_speed_r) <- "exit_speed_rhb"
# exit_speed_r$zone <- row.names(exit_speed_r)
# Create exit_speed_l
# exit_speed_l <- data.frame(tapply(pcontact_l$batted_ball_velocity,
# pcontact_l$zone, mean))
# exit_speed_l <- round(exit_speed_l, 1)
# colnames(exit_speed_l) <- "exit_speed_lhb"
# exit_speed_l$zone <- row.names(exit_speed_l)
# Merge with locgrid
# locgrid <- merge(locgrid, exit_speed_r, by = "zone", all.x = T)
# locgrid <- merge(locgrid, exit_speed_l, by = "zone", all.x = T)
# Print locgrid
# locgrid
# Create RHB exit speed plotting object
# plot_exit_rhb <- plot_base_grid +
# geom_tile(data = locgrid, aes(fill = exit_speed_rhb)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# limits = c(60, 95),
# breaks = seq(from = 60, to = 95, by = 5),
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$exit_speed_rhb, size = 5) +
# ggtitle("RHB Exit Velocity (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Create LHB exit speed plotting object
# plot_exit_lhb <- plot_base_grid +
# geom_tile(data = locgrid, aes(fill = exit_speed_lhb)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# limits = c(60, 95),
# breaks = seq(from = 60, to = 95, by = 5),
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# annotate("text", x = locgrid$px, y = locgrid$pz,
# label = locgrid$exit_speed_lhb, size = 5) +
# ggtitle("LHB Exit Velocity (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15))
# Plot each side-by-side
# grid.arrange(plot_exit_rhb, plot_exit_lhb, ncol=2)
# Examine head() and tail() of exit_tidy
# head(exit_tidy)
# tail(exit_tidy)
# Create plot_exit
# plot_exit <- plot_base_grid +
# geom_tile(data = exit_tidy, aes(fill = exit_speed)) +
# scale_fill_gradientn(name = "Exit Speed (mph)",
# colors = c(brewer.pal(n = 7, name = "Reds"))) +
# ggtitle("Exit Speed (mph)") +
# labs(x = "Horizontal Location(ft.; Catcher's View)",
# y = "Vertical Location (ft.)") +
# theme(plot.title = element_text(size = 15)) +
# facet_grid(. ~ batter_stand)
# Display plot_exit
# plot_exit
# Data note: the swing/contact dataset for the commented-out Chapter 4
# exercises above is unavailable, so only this notice is printed.
cat("\n\nDo not have the data to run the associated code\n\n")
##
##
## Do not have the data to run the associated code
Chapter 1 - Language of Data
Examining the “High School and Beyond” data frame - one observation per row, one variable per column:
Types of variables - take note of the dimensions first:
Categorical data in R - factors:
Discretize a variable - convert numerical variable to categorical variable:
Visualizing numerical data - good first step of any exploratory data analysis (picture is worth 1000 words):
Example code includes:
# Load the email50 dataset (a 50-email sample) from the openintro package.
data(email50, package="openintro")
# View its structure: column types and the first few values of each variable.
str(email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 07:19:16" "2012-02-16 14:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
# Glimpse email50 — the tidyverse counterpart to str(); prints one row
# per variable with its type and leading values.
glimpse(email50)
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 07:19:16, 2012-02-16 14:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
# Keep only the emails whose `number` factor equals "big": email50_big
email50_big <- filter(email50, number == "big")
# Inspect the structure of the resulting subset.
glimpse(email50_big)
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 14:10:06, 2012-02-04 17:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Tabulate the number variable: the unused levels ("none", "small")
# still appear with zero counts after subsetting.
table(email50_big$number)
##
## none small big
## 0 0 7
# Drop the now-empty factor levels from the number column only.
email50_big$number <- droplevels(email50_big$number)
# Re-tabulate: only the "big" level remains.
table(email50_big$number)
##
## big
## 7
# Calculate median number of characters: med_num_char
# Note that wrapping an assignment in () also prints the assigned value.
(med_num_char <- median(email50$num_char))
## [1] 6.8895
# Discretize num_char into a two-level categorical variable split at the median.
email50 <- email50 %>%
mutate(num_char_cat = ifelse(num_char < med_num_char, "below median", "at or above median"))
# Count emails in each category — a median split gives 25/25 here.
table(email50$num_char_cat)
##
## at or above median below median
## 25 25
# Create number_yn: collapse the 3-level number factor to a yes/no indicator.
email50 <- email50 %>%
mutate(number_yn = ifelse(number == "none", "no", "yes"))
# Visualize number_yn with a bar chart.
ggplot(email50, aes(x = number_yn)) +
geom_bar()
# Scatterplot of exclaim_mess vs. num_char, colored by spam status;
# factor() makes ggplot treat the 0/1 spam column as discrete.
ggplot(email50, aes(x = num_char, y = exclaim_mess, color = factor(spam))) +
geom_point()
Chapter 2 - Study Types and Cautions
Observational studies and experiments - study types, and scopes of inferences:
Random sampling and random assignment:
Simpson’s paradox - when a confounder interferes with understanding the relationship between response (y) variables and explanatory (x1, x2, etc.) variables:
Example code includes:
# Load the gapminder country-year panel from the gapminder package.
data(gapminder, package="gapminder")
# Glimpse data: 1,704 rows, 6 variables.
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Gapminder is collected without any treatment assignment, so this is
# an observational study.
type_of_study <- "observational"
# UCBAdmissions ships as a 3-way contingency table; convert it to a data
# frame of (Admit, Gender, Dept, Freq) combinations first.
dfUCB <- as.data.frame(UCBAdmissions)
# Expand the aggregated Freq counts into one row per applicant so the
# data can be analyzed with count() and other group-wise verbs.
ucb_admit <- data.frame(Admit=factor(rep(dfUCB$Admit, times=dfUCB$Freq)),
Gender=factor(rep(dfUCB$Gender, times=dfUCB$Freq)),
Dept=as.character(rep(dfUCB$Dept, times=dfUCB$Freq)),
stringsAsFactors=FALSE
)
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
# Tally applicants by admission status and gender.
ucb_counts <- count(ucb_admit, Admit, Gender)
# Display the resulting counts.
ucb_counts
## Source: local data frame [4 x 3]
## Groups: Admit [?]
##
## Admit Gender n
## <fctr> <fctr> <int>
## 1 Admitted Male 1198
## 2 Admitted Female 557
## 3 Rejected Male 1493
## 4 Rejected Female 1278
# Spread the counts across columns, one column per admission status.
# (tidyr::spread() is superseded by tidyr::pivot_wider() in current tidyr.)
ucb_counts %>%
tidyr::spread(Admit, n)
## # A tibble: 2 × 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Male 1198 1493
## 2 Female 557 1278
# Overall admission rate by gender, ignoring department — males appear
# to be admitted at a much higher rate (the marginal view).
ucb_admit %>%
# Table of counts of admission status and gender
count(Admit, Gender) %>%
# Spread output across columns based on admission status
tidyr::spread(Admit, n) %>%
# Proportion admitted = admitted / total applicants
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 × 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Male 1198 1493 0.4451877
## 2 Female 557 1278 0.3035422
# Counts of admission status and gender, broken out by department —
# the stratification needed to expose Simpson's paradox below.
admit_by_dept <- ucb_admit %>%
count(Dept, Gender, Admit) %>%
tidyr::spread(Admit, n)
# View result
admit_by_dept
## Source: local data frame [12 x 4]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Male 512 313
## 2 A Female 89 19
## 3 B Male 353 207
## 4 B Female 17 8
## 5 C Male 120 205
## 6 C Female 202 391
## 7 D Male 138 279
## 8 D Female 131 244
## 9 E Male 53 138
## 10 E Female 94 299
## 11 F Male 22 351
## 12 F Female 24 317
# Admission rate by gender *within* each department: once Dept is held
# fixed, the apparent overall gender gap mostly disappears or reverses
# (Simpson's paradox).
mutate(admit_by_dept, Perc_Admit = Admitted / (Admitted + Rejected))
## Source: local data frame [12 x 5]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Male 512 313 0.62060606
## 2 A Female 89 19 0.82407407
## 3 B Male 353 207 0.63035714
## 4 B Female 17 8 0.68000000
## 5 C Male 120 205 0.36923077
## 6 C Female 202 391 0.34064081
## 7 D Male 138 279 0.33093525
## 8 D Female 131 244 0.34933333
## 9 E Male 53 138 0.27748691
## 10 E Female 94 299 0.23918575
## 11 F Male 22 351 0.05898123
## 12 F Female 24 317 0.07038123
Chapter 3 - Sampling Strategies and Experimental Design
Sampling strategies:
Sampling in R:
Principles of experimental design:
Example code includes:
# Rebuild the us_regions lookup table (50 states + DC, each paired with
# its Census region) from two parallel " ; "-delimited strings.
usrState <- "Connecticut ; Maine ; Massachusetts ; New Hampshire ; Rhode Island ; Vermont ; New Jersey ; New York ; Pennsylvania ; Illinois ; Indiana ; Michigan ; Ohio ; Wisconsin ; Iowa ; Kansas ; Minnesota ; Missouri ; Nebraska ; North Dakota ; South Dakota ; Delaware ; Florida ; Georgia ; Maryland ; North Carolina ; South Carolina ; Virginia ; District of Columbia ; West Virginia ; Alabama ; Kentucky ; Mississippi ; Tennessee ; Arkansas ; Louisiana ; Oklahoma ; Texas ; Arizona ; Colorado ; Idaho ; Montana ; Nevada ; New Mexico ; Utah ; Wyoming ; Alaska ; California ; Hawaii ; Oregon ; Washington"
usrRegion <- "Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West"
# Split each string into its element vector, then pair them up.
state_vec <- strsplit(usrState, " ; ")[[1]]
region_vec <- strsplit(usrRegion, " ; ")[[1]]
us_regions <- data.frame(state = factor(state_vec),
                         region = factor(region_vec))
# Simple random sample of 8 states: states_srs.
# NOTE: no set.seed() is called, so the draw (and the counts below) will
# differ from run to run; the captured output is one past realization.
states_srs <- us_regions %>%
dplyr::sample_n(size=8)
# Count sampled states by region — an SRS gives no control over how
# many states come from each region.
states_srs %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 3
## 2 Northeast 2
## 3 South 1
## 4 West 2
# Stratified sample: sample_n() after group_by() draws 2 states from
# *each* region, guaranteeing equal representation per stratum.
states_str <- us_regions %>%
group_by(region) %>%
dplyr::sample_n(size=2)
# Count states by region — exactly 2 per region by construction.
states_str %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
Chapter 4 - Case Study
Data will be from a study titled “Beauty in the Classroom”:
Variables in the data:
Example code includes:
# NEED DATASET — the full "Beauty in the Classroom" evals data is not
# available, so three of its columns (score, cls_students, bty_avg) are
# reconstructed from the " ; "-delimited strings below.
evStudents <- "43 ; 125 ; 125 ; 123 ; 20 ; 40 ; 44 ; 55 ; 195 ; 46 ; 27 ; 25 ; 20 ; 25 ; 42 ; 20 ; 18 ; 48 ; 44 ; 48 ; 45 ; 59 ; 87 ; 282 ; 292 ; 130 ; 285 ; 272 ; 286 ; 302 ; 41 ; 34 ; 41 ; 41 ; 34 ; 41 ; 22 ; 21 ; 17 ; 30 ; 23 ; 20 ; 60 ; 33 ; 44 ; 49 ; 29 ; 48 ; 40 ; 19 ; 16 ; 15 ; 23 ; 11 ; 29 ; 21 ; 18 ; 19 ; 20 ; 25 ; 33 ; 24 ; 34 ; 21 ; 30 ; 25 ; 35 ; 40 ; 30 ; 42 ; 57 ; 57 ; 51 ; 30 ; 36 ; 37 ; 29 ; 27 ; 28 ; 52 ; 26 ; 30 ; 33 ; 177 ; 199 ; 32 ; 37 ; 161 ; 41 ; 44 ; 53 ; 49 ; 32 ; 135 ; 33 ; 19 ; 111 ; 149 ; 27 ; 136 ; 140 ; 31 ; 15 ; 29 ; 25 ; 18 ; 45 ; 15 ; 38 ; 15 ; 28 ; 23 ; 19 ; 23 ; 22 ; 20 ; 19 ; 23 ; 22 ; 15 ; 22 ; 31 ; 21 ; 36 ; 19 ; 37 ; 26 ; 39 ; 184 ; 50 ; 157 ; 164 ; 24 ; 68 ; 47 ; 14 ; 15 ; 24 ; 39 ; 26 ; 40 ; 159 ; 151 ; 47 ; 122 ; 45 ; 16 ; 23 ; 16 ; 18 ; 16 ; 15 ; 28 ; 17 ; 13 ; 21 ; 17 ; 134 ; 48 ; 64 ; 69 ; 12 ; 43 ; 14 ; 15 ; 18 ; 16 ; 10 ; 47 ; 15 ; 14 ; 12 ; 246 ; 316 ; 15 ; 15 ; 29 ; 21 ; 8 ; 16 ; 26 ; 10 ; 26 ; 26 ; 26 ; 21 ; 12 ; 27 ; 27 ; 25 ; 15 ; 15 ; 17 ; 55 ; 48 ; 21 ; 39 ; 27 ; 14 ; 26 ; 16 ; 16 ; 13 ; 14 ; 17 ; 13 ; 15 ; 10 ; 34 ; 16 ; 14 ; 12 ; 39 ; 35 ; 45 ; 45 ; 17 ; 14 ; 14 ; 14 ; 12 ; 15 ; 51 ; 23 ; 57 ; 50 ; 24 ; 23 ; 23 ; 28 ; 45 ; 42 ; 57 ; 27 ; 38 ; 22 ; 43 ; 31 ; 13 ; 15 ; 34 ; 19 ; 20 ; 23 ; 27 ; 32 ; 21 ; 24 ; 21 ; 28 ; 29 ; 67 ; 89 ; 82 ; 122 ; 131 ; 114 ; 149 ; 23 ; 98 ; 27 ; 30 ; 30 ; 69 ; 15 ; 10 ; 11 ; 14 ; 11 ; 14 ; 77 ; 41 ; 88 ; 78 ; 65 ; 157 ; 68 ; 67 ; 80 ; 137 ; 69 ; 91 ; 80 ; 90 ; 34 ; 73 ; 44 ; 36 ; 20 ; 35 ; 248 ; 168 ; 247 ; 22 ; 103 ; 62 ; 82 ; 51 ; 35 ; 34 ; 37 ; 14 ; 266 ; 254 ; 13 ; 282 ; 17 ; 19 ; 42 ; 27 ; 16 ; 19 ; 86 ; 29 ; 88 ; 98 ; 44 ; 65 ; 63 ; 75 ; 43 ; 80 ; 52 ; 48 ; 66 ; 100 ; 11 ; 16 ; 22 ; 11 ; 10 ; 16 ; 16 ; 10 ; 32 ; 10 ; 16 ; 67 ; 22 ; 28 ; 30 ; 15 ; 13 ; 18 ; 26 ; 30 ; 14 ; 24 ; 22 ; 25 ; 26 ; 22 ; 26 ; 20 ; 22 ; 21 ; 21 ; 69 ; 65 ; 62 ; 67 ; 40 ; 45 ; 574 ; 579 ; 537 ; 581 ; 527 ; 87 ; 84 ; 79 ; 92 ; 24 ; 67 ; 103 ; 190 ; 68 ; 60 ; 64 ; 31 ; 62 ; 37 ; 13 ; 13 ; 15 ; 79 ; 13 ; 
98 ; 97 ; 11 ; 78 ; 56 ; 20 ; 17 ; 20 ; 19 ; 26 ; 14 ; 18 ; 12 ; 19 ; 16 ; 16 ; 12 ; 17 ; 15 ; 16 ; 17 ; 21 ; 17 ; 10 ; 17 ; 17 ; 18 ; 16 ; 26 ; 18 ; 20 ; 17 ; 21 ; 21 ; 20 ; 20 ; 13 ; 16 ; 17 ; 18 ; 24 ; 20 ; 120 ; 155 ; 38 ; 70 ; 149 ; 137 ; 29 ; 55 ; 136 ; 96 ; 60 ; 108 ; 39 ; 15 ; 111 ; 17 ; 19 ; 27 ; 19 ; 13 ; 19 ; 22 ; 20 ; 27 ; 132 ; 127 ; 85 ; 101 ; 21 ; 86 ; 84 ; 67 ; 66 ; 35"
evScore <- "4.7 ; 4.1 ; 3.9 ; 4.8 ; 4.6 ; 4.3 ; 2.8 ; 4.1 ; 3.4 ; 4.5 ; 3.8 ; 4.5 ; 4.6 ; 3.9 ; 3.9 ; 4.3 ; 4.5 ; 4.8 ; 4.6 ; 4.6 ; 4.9 ; 4.6 ; 4.5 ; 4.4 ; 4.6 ; 4.7 ; 4.5 ; 4.8 ; 4.9 ; 4.5 ; 4.4 ; 4.3 ; 4.1 ; 4.2 ; 3.5 ; 3.4 ; 4.5 ; 4.4 ; 4.4 ; 2.5 ; 4.3 ; 4.5 ; 4.8 ; 4.8 ; 4.4 ; 4.7 ; 4.4 ; 4.7 ; 4.5 ; 4 ; 4.3 ; 4.4 ; 4.5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.7 ; 5 ; 3.6 ; 3.7 ; 4.3 ; 4.1 ; 4.2 ; 4.7 ; 4.7 ; 3.5 ; 4.1 ; 4.2 ; 4 ; 4 ; 3.9 ; 4.4 ; 3.8 ; 3.5 ; 4.2 ; 3.5 ; 3.6 ; 2.9 ; 3.3 ; 3.3 ; 3.2 ; 4.6 ; 4.2 ; 4.3 ; 4.4 ; 4.1 ; 4.6 ; 4.4 ; 4.8 ; 4.3 ; 3.6 ; 4.3 ; 4 ; 4.2 ; 4.1 ; 4.1 ; 4.4 ; 4.3 ; 4.4 ; 4.4 ; 4.9 ; 5 ; 4.4 ; 4.8 ; 4.9 ; 4.3 ; 5 ; 4.7 ; 4.5 ; 3.5 ; 3.9 ; 4 ; 4 ; 3.7 ; 3.4 ; 3.3 ; 3.8 ; 3.9 ; 3.4 ; 3.7 ; 4.1 ; 3.7 ; 3.5 ; 3.5 ; 4.4 ; 3.4 ; 4.3 ; 3.7 ; 4.7 ; 3.9 ; 3.6 ; 4.5 ; 4.5 ; 4.8 ; 4.8 ; 4.7 ; 4.5 ; 4.3 ; 4.8 ; 4.1 ; 4.4 ; 4.3 ; 3.6 ; 4.5 ; 4.3 ; 4.4 ; 4.7 ; 4.8 ; 3.5 ; 3.8 ; 3.6 ; 4.2 ; 3.6 ; 4.4 ; 3.7 ; 4.3 ; 4.6 ; 4.6 ; 4.1 ; 3.6 ; 2.3 ; 4.3 ; 4.4 ; 3.6 ; 4.4 ; 3.9 ; 3.8 ; 3.4 ; 4.9 ; 4.1 ; 3.2 ; 4.2 ; 3.9 ; 4.9 ; 4.7 ; 4.4 ; 4.2 ; 4 ; 4.4 ; 3.9 ; 4.4 ; 3 ; 3.5 ; 2.8 ; 4.6 ; 4.3 ; 3.4 ; 3 ; 4.2 ; 4.3 ; 4.1 ; 4.6 ; 3.9 ; 3.5 ; 4 ; 4 ; 3.9 ; 3.3 ; 4 ; 3.8 ; 4.2 ; 4 ; 3.8 ; 3.3 ; 4.1 ; 4.7 ; 4.4 ; 4.8 ; 4.8 ; 4.6 ; 4.6 ; 4.8 ; 4.4 ; 4.7 ; 4.7 ; 3.3 ; 4.4 ; 4.3 ; 4.9 ; 4.4 ; 4.7 ; 4.3 ; 4.8 ; 4.5 ; 4.7 ; 3.3 ; 4.7 ; 4.6 ; 3.6 ; 4 ; 4.1 ; 4 ; 4.5 ; 4.6 ; 4.8 ; 4.6 ; 4.9 ; 3.1 ; 3.7 ; 3.7 ; 3.9 ; 3.9 ; 3.2 ; 4.4 ; 4.2 ; 4.7 ; 3.9 ; 3.6 ; 3.4 ; 4.4 ; 4.4 ; 4.1 ; 3.6 ; 3.5 ; 4.1 ; 3.8 ; 4 ; 4.8 ; 4.2 ; 4.6 ; 4.3 ; 4.8 ; 3.8 ; 4.5 ; 4.9 ; 4.9 ; 4.8 ; 4.7 ; 4.6 ; 4.3 ; 4.4 ; 4.5 ; 4.2 ; 4.8 ; 4.6 ; 4.9 ; 4.8 ; 4.8 ; 4.6 ; 4.7 ; 4.1 ; 3.8 ; 4 ; 4.1 ; 4 ; 4.1 ; 3.5 ; 4.1 ; 3.6 ; 4 ; 3.9 ; 3.8 ; 4.4 ; 4.7 ; 3.8 ; 4.1 ; 4.1 ; 4.7 ; 4.3 ; 4.4 ; 4.5 ; 3.1 ; 3.7 ; 4.5 ; 3 ; 4.6 ; 3.7 ; 3.6 ; 3.2 ; 3.3 ; 2.9 ; 4.2 ; 4.5 ; 3.8 ; 3.7 ; 3.7 ; 4 ; 3.7 ; 4.5 ; 3.8 ; 3.9 ; 4.6 ; 4.5 ; 4.2 ; 4 ; 3.8 ; 3.5 ; 2.7 ; 4 ; 4.6 ; 3.9 ; 4.5 ; 3.7 ; 2.4 ; 3.1 ; 2.5 ; 3 ; 4.5 ; 4.8 ; 4.9 ; 
4.5 ; 4.6 ; 4.5 ; 4.9 ; 4.4 ; 4.6 ; 4.6 ; 5 ; 4.9 ; 4.6 ; 4.8 ; 4.9 ; 4.9 ; 4.9 ; 5 ; 4.5 ; 3.5 ; 3.8 ; 3.9 ; 3.9 ; 4.2 ; 4.1 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.9 ; 4.2 ; 4.5 ; 3.9 ; 4.4 ; 4 ; 3.6 ; 3.7 ; 2.7 ; 4.5 ; 4.4 ; 3.9 ; 3.6 ; 4.4 ; 4.4 ; 4.7 ; 4.5 ; 4.1 ; 3.7 ; 4.3 ; 3.5 ; 3.7 ; 4 ; 4 ; 3.1 ; 4.5 ; 4.8 ; 4.2 ; 4.9 ; 4.8 ; 3.5 ; 3.6 ; 4.4 ; 3.4 ; 3.9 ; 3.8 ; 4.8 ; 4.6 ; 5 ; 3.8 ; 4.2 ; 3.3 ; 4.7 ; 4.6 ; 4.6 ; 4 ; 4.2 ; 4.9 ; 4.5 ; 4.8 ; 3.8 ; 4.8 ; 5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.8 ; 4.9 ; 4.9 ; 3.9 ; 3.9 ; 4.5 ; 4.5 ; 3.3 ; 3.1 ; 2.8 ; 3.1 ; 4.2 ; 3.4 ; 3 ; 3.3 ; 3.6 ; 3.7 ; 3.6 ; 4.3 ; 4.1 ; 4.9 ; 4.8 ; 3.7 ; 3.9 ; 4.5 ; 3.6 ; 4.4 ; 3.4 ; 4.4 ; 4.5 ; 4.5 ; 4.5 ; 4.6 ; 4.1 ; 4.5 ; 3.5 ; 4.4 ; 4.4 ; 4.1"
evBty <- "5 ; 5 ; 5 ; 5 ; 3 ; 3 ; 3 ; 3.3 ; 3.3 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.8 ; 4.8 ; 4.8 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4 ; 4 ; 4 ; 4 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 2.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 7.8 ; 7.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 5.2 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.7 ; 2.7 ; 2.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.3 ; 2.3 ; 2.3 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 3 ; 3 ; 3 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 6.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 8.2 ; 8.2 ; 8.2 ; 8.2 ; 6.5 ; 6.5 ; 6.5 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 7 ; 7 ; 7 ; 4.7 ; 3.8 ; 3.8 ; 3.8 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.7 ; 5.7 ; 5.7 ; 5.7 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 6.7 ; 6.7 ; 6.7 ; 3.7 ; 3.7 ; 3.7 ; 3.8 ; 3.8 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.7 ; 3.7 ; 3.5 ; 3.5 ; 3.5 ; 2.7 ; 5.7 ; 6 ; 6 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 7.2 ; 7.2 ; 1.7 ; 1.7 ; 1.7 ; 5.2 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 
3.5 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.2 ; 5.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 3 ; 3 ; 3 ; 6.3 ; 6.3 ; 6.3 ; 6.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 5.8 ; 5.8 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 7.8 ; 7.8 ; 7.8 ; 3.3 ; 3.3 ; 4.5 ; 4.5 ; 4.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 5.3 ; 5.3 ; 5.3 ; 5.3"
# Assemble the reconstructed evals data frame (463 course evaluations).
# Note: the string literals above intentionally span two source lines;
# the embedded newline is whitespace-tolerated by as.numeric/as.integer.
evals <- data.frame(score=as.numeric(strsplit(evScore, " ; ")[[1]]),
cls_students=as.integer(strsplit(evStudents, " ; ")[[1]]),
bty_avg=as.numeric(strsplit(evBty, " ; ")[[1]])
)
# Inspect evals
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Inspect variable types — NOTE(review): this repeats the glimpse() call
# above verbatim, so its output is identical.
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Categorical variables of the full course dataset. NOTE(review): the
# reconstructed evals above contains only score, cls_students, and
# bty_avg, so cat_vars does not appear to be used later in this excerpt.
cat_vars <- c("rank", "ethnicity", "gender", "language",
"cls_level", "cls_profs", "cls_credits",
"pic_outfit", "pic_color")
# Discretize cls_students into a three-level class-size category:
# "small" (<= 18), "large" (>= 60), and "midsize" (everything between).
evals <- mutate(
  evals,
  cls_type = ifelse(cls_students <= 18, "small",
                    ifelse(cls_students >= 60, "large", "midsize"))
)
# Scatterplot of evaluation score against average beauty rating.
ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_point()
# The same scatterplot, with points colored by class-size category.
ggplot(evals, aes(x = bty_avg, y = score, color = cls_type)) +
  geom_point()
Chapter 1 - Exploring categorical data
Exploring categorical data; based on a comic book dataset of DC vs Marvel:
Counts vs proportions - the proportions are often much more meaningful:
Distribution of one variable - the typical way to begin exploring a dataset:
Example code includes:
## ISSUE - do not have (and cannot find) the original comics tibble, so
## it is reconstructed below from its gender x align contingency counts.
# Cell counts in gender-major order: for each gender (Female, Male,
# Other, NA) the five align values (Bad, Good, Neutral,
# Reformed Criminals, NA).
comCounts <- c(1573, 2490, 836, 1, 904, 7561, 4809, 1799, 2,
2250, 32, 17, 17, 0, 2, 449, 152, 121, 0, 257
)
# each=5 repeats each gender once per align level ...
comGender <- rep(rep(c("Female", "Male", "Other", NA), each=5),
times=comCounts
)
# ... while times=4 cycles the align levels once per gender; the outer
# rep(times=comCounts) then expands each (gender, align) pair to its count.
comAlign <- rep(rep(c("Bad", "Good", "Neutral", "Reformed Criminals", NA), times=4),
times=comCounts
)
# Combine into a tibble of one row per comic-book character.
comics <- tibble::as_tibble(data.frame(gender=factor(comGender),
align=factor(comAlign)
)
)
# Print the first rows of the data
comics
## # A tibble: 23,272 × 2
## gender align
## <fctr> <fctr>
## 1 Female Bad
## 2 Female Bad
## 3 Female Bad
## 4 Female Bad
## 5 Female Bad
## 6 Female Bad
## 7 Female Bad
## 8 Female Bad
## 9 Female Bad
## 10 Female Bad
## # ... with 23,262 more rows
# Check levels of align — note the nearly-empty "Reformed Criminals" level.
levels(comics$align)
## [1] "Bad" "Good" "Neutral"
## [4] "Reformed Criminals"
# Check the levels of gender
levels(comics$gender)
## [1] "Female" "Male" "Other"
# 2-way contingency table of alignment by gender (NA values are omitted
# by table()'s default useNA = "no", as the output shows).
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
# Drop the nearly-empty "Reformed Criminals" rows, then remove the
# now-unused factor levels so they no longer appear in tables/plots.
comics <- droplevels(filter(comics, align != "Reformed Criminals"))
# Side-by-side (dodged) barchart of gender within each alignment.
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "dodge")
# Side-by-side barchart of alignment within each gender; x-axis labels
# are rotated 90 degrees so they do not overlap.
ggplot(comics, aes(x = gender, fill = align)) +
geom_bar(position = "dodge") +
theme(axis.text.x = element_text(angle = 90))
# Stacked counts of gender by align (geom_bar's default position).
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar()
# Proportion of gender conditional on align: position = "fill" rescales
# each bar to total 1, making proportions comparable across alignments.
ggplot(comics, aes(x = align, fill = gender)) +
geom_bar(position = "fill")
# Reorder the align levels from Bad to Good so bars plot in that order.
comics$align <- factor(comics$align,
levels = c("Bad", "Neutral", "Good"))
# Marginal distribution of align.
ggplot(comics, aes(x = align)) +
geom_bar()
# Distribution of align, faceted into one panel per gender.
ggplot(comics, aes(x = align)) +
geom_bar() +
facet_wrap(~ gender)
pieFlavor <- "cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin"
# Split the " ; "-delimited string into one row per pie sold
pies <- data.frame(flavor=factor(strsplit(pieFlavor, " ; ")[[1]]))
# Garden variety pie chart: a single stacked, normalized bar (x = factor(1))
# wrapped into polar coordinates around the y axis
ggplot(pies, aes(x=factor(1), fill=flavor)) +
geom_bar(position = "fill") +
coord_polar(theta="y") +
labs(x='', y='')
# Put levels of flavor in descending order
# Explicit level ordering so the bars draw in this left-to-right order
lev <- c("apple", "key lime", "boston creme", "blueberry", "cherry", "pumpkin", "strawberry")
pies$flavor <- factor(pies$flavor, levels = lev)
# Create barchart of flavor (single fixed fill color)
ggplot(pies, aes(x = flavor)) +
geom_bar(fill = "chartreuse") +
theme(axis.text.x = element_text(angle = 90))
# If you prefer that it still be multi-colored like the pie,
# map fill to flavor inside aes() instead of setting a constant
ggplot(pies, aes(x = flavor)) +
geom_bar(aes(fill=flavor)) +
theme(axis.text.x = element_text(angle = 90))
Chapter 2 - Exploring numerical data
Exploring numerical data - cars that were available for sale in a given year (428 x 19 tbl_df):
Distribution of one variable:
Box plots are based around three characteristics of the data:
Visualization in higher dimensions:
Example code includes:
# Time to create some data . . .
carCityMPG <- "28 ; 28 ; 26 ; 26 ; 26 ; 29 ; 29 ; 26 ; 27 ; 26 ; 26 ; 32 ; 36 ; 32 ; 29 ; 29 ; 29 ; 26 ; 26 ; 26 ; 23 ; 26 ; 25 ; 24 ; 24 ; 24 ; NA ; 28 ; NA ; NA ; 28 ; 28 ; 24 ; 26 ; 26 ; 26 ; 26 ; 26 ; 32 ; 25 ; 25 ; 24 ; 22 ; 32 ; 32 ; 32 ; 35 ; 33 ; 35 ; 20 ; 21 ; 24 ; 22 ; 21 ; 22 ; 22 ; 22 ; 21 ; 21 ; 21 ; 21 ; 21 ; 20 ; 19 ; 26 ; 26 ; 32 ; 26 ; 46 ; 60 ; 19 ; 19 ; 20 ; NA ; 24 ; 20 ; 25 ; NA ; NA ; 21 ; 23 ; 24 ; 20 ; 20 ; 24 ; 20 ; 22 ; 21 ; 20 ; 24 ; 21 ; 24 ; 20 ; 59 ; 24 ; 24 ; 38 ; 24 ; 24 ; 22 ; 22 ; 20 ; 20 ; 20 ; 18 ; 20 ; 18 ; 23 ; 18 ; 18 ; 21 ; 19 ; 21 ; 22 ; 18 ; 17 ; 17 ; 21 ; 21 ; 17 ; 17 ; 18 ; 18 ; 18 ; 17 ; 22 ; 19 ; 17 ; 17 ; 19 ; 18 ; 18 ; 21 ; 20 ; 20 ; 20 ; 20 ; 21 ; 20 ; 19 ; 21 ; 21 ; 20 ; 21 ; 24 ; 22 ; 22 ; 20 ; 23 ; 20 ; 17 ; 18 ; 20 ; 18 ; 20 ; 19 ; 19 ; 20 ; 20 ; 20 ; 19 ; 20 ; 20 ; 18 ; 18 ; 21 ; 17 ; 18 ; 19 ; 18 ; 20 ; 18 ; 18 ; 20 ; 20 ; 20 ; 19 ; 19 ; 20 ; 19 ; 17 ; 17 ; NA ; 20 ; 20 ; 21 ; 21 ; 19 ; 21 ; 19 ; 18 ; 20 ; 20 ; 18 ; 20 ; 20 ; 18 ; 18 ; 20 ; 18 ; 18 ; 17 ; 17 ; 14 ; 19 ; 20 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 17 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 18 ; 17 ; 17 ; 17 ; 17 ; 17 ; 16 ; 16 ; 13 ; 20 ; 17 ; 19 ; 16 ; 18 ; 16 ; 21 ; 21 ; NA ; NA ; 21 ; 20 ; 19 ; 17 ; 15 ; 20 ; 20 ; 21 ; 16 ; 16 ; 20 ; 21 ; 17 ; 18 ; 18 ; 17 ; NA ; 20 ; 17 ; 17 ; 20 ; 19 ; 18 ; 18 ; 16 ; 16 ; 18 ; 23 ; 23 ; 18 ; 18 ; 16 ; 14 ; 13 ; 21 ; 17 ; 21 ; 21 ; 18 ; 20 ; 20 ; NA ; 18 ; 17 ; 18 ; 17 ; 20 ; 18 ; 20 ; 18 ; 24 ; 26 ; 14 ; 16 ; 14 ; 14 ; 15 ; NA ; 15 ; 15 ; 16 ; 13 ; 10 ; 15 ; 13 ; 13 ; 14 ; 17 ; 16 ; 16 ; 15 ; 19 ; 16 ; 15 ; 17 ; 17 ; 16 ; 16 ; 12 ; 15 ; 13 ; 18 ; 13 ; 13 ; 14 ; 16 ; 17 ; 15 ; 16 ; 19 ; 14 ; 21 ; 18 ; 18 ; 18 ; 13 ; 15 ; 15 ; 19 ; 18 ; 21 ; 21 ; 20 ; 20 ; 16 ; 12 ; 18 ; 22 ; 21 ; 17 ; 19 ; 22 ; 18 ; 15 ; 19 ; 22 ; 17 ; 26 ; 19 ; 16 ; 15 ; 26 ; 18 ; 19 ; 19 ; 16 ; 19 ; NA ; 20 ; 29 ; 19 ; 24 ; 31 ; 21 ; 21 ; 24 ; 29 ; 24 ; 22 ; 18 ; 22 ; 20 ; 14 ; 19 ; 19 ; 18 ; 20 ; 18 ; 17 ; 16 ; 18 ; 18 ; 16 ; 18 ; 16 ; 
19 ; 18 ; 19 ; 19 ; 18 ; 19 ; 19 ; 13 ; 14 ; 18 ; 15 ; 13 ; 16 ; 16 ; 16 ; 16 ; 15 ; 14 ; 24 ; 19 ; 17 ; NA ; 15 ; 24 ; 15 ; 17 ; 14 ; 21 ; 22 ; 16 ; 14"
carSUV <- "0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0"
carNCyl <- "4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 3 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 8 ; 5 ; 5 ; 5 ; 6 ; 5 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 12 ; 6 ; 8 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 8 ; 12 ; 5 ; 5 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 10 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 4 ; 4 ; -1 ; -1 ; 8 ; 8 ; 12 ; 4 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 8 ; 8 ; 8 ; 8 ; 8 ; 10 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 8 ; 6 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 8 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 8 ; 4 ; 5 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 4 ; 8 ; 8 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6"
carHP <- "103 ; 103 ; 140 ; 140 ; 140 ; 132 ; 132 ; 130 ; 110 ; 130 ; 130 ; 115 ; 117 ; 115 ; 103 ; 103 ; 103 ; 138 ; 138 ; 138 ; 138 ; 104 ; 104 ; 124 ; 124 ; 124 ; 148 ; 115 ; 120 ; 120 ; 126 ; 126 ; 140 ; 140 ; 140 ; 140 ; 140 ; 140 ; 108 ; 155 ; 155 ; 119 ; 119 ; 130 ; 130 ; 130 ; 108 ; 108 ; 108 ; 175 ; 180 ; 145 ; 200 ; 180 ; 150 ; 150 ; 150 ; 200 ; 200 ; 150 ; 150 ; 170 ; 155 ; 201 ; 160 ; 160 ; 127 ; 160 ; 93 ; 73 ; 170 ; 170 ; 170 ; 160 ; 160 ; 155 ; 163 ; 160 ; 120 ; 175 ; 165 ; 140 ; 175 ; 200 ; 140 ; 182 ; 165 ; 165 ; 155 ; 157 ; 210 ; 157 ; 225 ; 110 ; 115 ; 180 ; 100 ; 150 ; 200 ; 200 ; 170 ; 184 ; 205 ; 200 ; 240 ; 200 ; 240 ; 200 ; 200 ; 250 ; 200 ; 232 ; 220 ; 150 ; 232 ; 224 ; 224 ; 240 ; 240 ; 194 ; 194 ; 260 ; 280 ; 192 ; 195 ; 189 ; 215 ; 224 ; 224 ; 201 ; 205 ; 230 ; 245 ; 265 ; 265 ; 170 ; 200 ; 165 ; 165 ; 212 ; 210 ; 210 ; 225 ; 200 ; 115 ; 170 ; 170 ; 270 ; 170 ; 220 ; 220 ; 220 ; 220 ; 220 ; 184 ; 184 ; 184 ; 225 ; 225 ; 225 ; 184 ; 205 ; 205 ; 255 ; 255 ; 200 ; 239 ; 260 ; 255 ; 227 ; 225 ; 215 ; 215 ; 232 ; 232 ; 168 ; 168 ; 215 ; 215 ; 215 ; 224 ; 302 ; 275 ; 210 ; 210 ; 220 ; 250 ; 212 ; 210 ; 190 ; 270 ; 208 ; 247 ; 300 ; 208 ; 194 ; 225 ; 225 ; 220 ; 220 ; 250 ; 300 ; 330 ; 340 ; 225 ; 225 ; 325 ; 325 ; 325 ; 240 ; 275 ; 300 ; 275 ; 340 ; 340 ; 235 ; 294 ; 390 ; 294 ; 294 ; 390 ; 220 ; 300 ; 290 ; 280 ; 280 ; 239 ; 239 ; 239 ; 349 ; 302 ; 493 ; 215 ; 302 ; 221 ; 302 ; 275 ; 302 ; 210 ; 210 ; 335 ; 420 ; 197 ; 242 ; 268 ; 290 ; 450 ; 180 ; 225 ; 250 ; 333 ; 333 ; 184 ; 225 ; 320 ; 350 ; 350 ; 215 ; 500 ; 193 ; 260 ; 280 ; 240 ; 172 ; 294 ; 294 ; 390 ; 390 ; 300 ; 142 ; 142 ; 197 ; 238 ; 302 ; 493 ; 493 ; 192 ; 349 ; 210 ; 210 ; 271 ; 287 ; 287 ; 340 ; 315 ; 315 ; 315 ; 477 ; 228 ; 258 ; 227 ; 300 ; 180 ; 138 ; 295 ; 320 ; 295 ; 295 ; 230 ; 310 ; 232 ; 275 ; 285 ; 325 ; 316 ; 275 ; 300 ; 305 ; 240 ; 265 ; 225 ; 325 ; 275 ; 185 ; 275 ; 210 ; 240 ; 193 ; 195 ; 192 ; 282 ; 235 ; 235 ; 230 ; 302 ; 292 ; 288 ; 210 ; 215 ; 215 ; 240 ; 185 ; 
340 ; 143 ; 185 ; 245 ; 230 ; 325 ; 220 ; 268 ; 165 ; 201 ; 160 ; 160 ; 173 ; 150 ; 190 ; 217 ; 174 ; 130 ; 160 ; 180 ; 165 ; 161 ; 220 ; 340 ; 184 ; 200 ; 250 ; 130 ; 155 ; 280 ; 315 ; 104 ; 215 ; 168 ; 221 ; 302 ; 155 ; 160 ; 245 ; 130 ; 250 ; 140 ; 108 ; 165 ; 165 ; 155 ; 130 ; 115 ; 170 ; 270 ; 170 ; 208 ; 190 ; 185 ; 180 ; 215 ; 150 ; 215 ; 193 ; 190 ; 240 ; 240 ; 195 ; 200 ; 201 ; 240 ; 240 ; 185 ; 185 ; 185 ; 230 ; 230 ; 345 ; 295 ; 175 ; 200 ; 300 ; 300 ; 210 ; 210 ; 215 ; 231 ; 300 ; 143 ; 175 ; 285 ; 300 ; 190 ; 143 ; 207 ; 180 ; 305 ; 165 ; 142 ; 190 ; 190"
carMSRP <- "11690 ; 12585 ; 14610 ; 14810 ; 16385 ; 13670 ; 15040 ; 13270 ; 13730 ; 15460 ; 15580 ; 13270 ; 14170 ; 15850 ; 10539 ; 11839 ; 11939 ; 13839 ; 15389 ; 15389 ; 16040 ; 10280 ; 11155 ; 12360 ; 13580 ; 14630 ; 15500 ; 16999 ; 14622 ; 16722 ; 12740 ; 14740 ; 15495 ; 10995 ; 14300 ; 15825 ; 14850 ; 16350 ; 12965 ; 12884 ; 14500 ; 12269 ; 15568 ; 14085 ; 15030 ; 15295 ; 10760 ; 11560 ; 11290 ; 22180 ; 21900 ; 18995 ; 20370 ; 21825 ; 17985 ; 22000 ; 19090 ; 21840 ; 22035 ; 18820 ; 20220 ; 19135 ; 20320 ; 22735 ; 19860 ; 22260 ; 17750 ; 19490 ; 20140 ; 19110 ; 19339 ; 20339 ; 18435 ; 17200 ; 19270 ; 21595 ; 19999 ; 19312 ; 17232 ; 19240 ; 17640 ; 18825 ; 22450 ; 22395 ; 17735 ; 21410 ; 19945 ; 20445 ; 17262 ; 19560 ; 22775 ; 19635 ; 21965 ; 20510 ; 18715 ; 19825 ; 21055 ; 21055 ; 23820 ; 26990 ; 25940 ; 28495 ; 26470 ; 24895 ; 28345 ; 25000 ; 27995 ; 23495 ; 24225 ; 29865 ; 24130 ; 26860 ; 25955 ; 25215 ; 24885 ; 24345 ; 27370 ; 23760 ; 26960 ; 24589 ; 26189 ; 28495 ; 29795 ; 29995 ; 26000 ; 26060 ; 28370 ; 24695 ; 29595 ; 23895 ; 29282 ; 25700 ; 23290 ; 27490 ; 29440 ; 23675 ; 24295 ; 25645 ; 27145 ; 29345 ; 26560 ; 25920 ; 26510 ; 23785 ; 23215 ; 23955 ; 25135 ; 33195 ; 35940 ; 31840 ; 33430 ; 34480 ; 36640 ; 39640 ; 30795 ; 37995 ; 30245 ; 35495 ; 36995 ; 37245 ; 39995 ; 32245 ; 35545 ; 30835 ; 33295 ; 30950 ; 30315 ; 32445 ; 31145 ; 33995 ; 32350 ; 31045 ; 32415 ; 32495 ; 36895 ; 32280 ; 33480 ; 35920 ; 37630 ; 38830 ; 30895 ; 34495 ; 35995 ; 30860 ; 33360 ; 35105 ; 39465 ; 31545 ; 30920 ; 33180 ; 39235 ; 31745 ; 34845 ; 37560 ; 37730 ; 37885 ; 43755 ; 46100 ; 42490 ; 44240 ; 42840 ; 49690 ; 69190 ; 48040 ; 44295 ; 44995 ; 54995 ; 69195 ; 73195 ; 40720 ; 45445 ; 50595 ; 47955 ; 42845 ; 52545 ; 43895 ; 49995 ; 63120 ; 68995 ; 59995 ; 74995 ; 41010 ; 48450 ; 55750 ; 40095 ; 43495 ; 41815 ; 44925 ; 50470 ; 52120 ; 94820 ; 128420 ; 45707 ; 52800 ; 48170 ; 57270 ; 74320 ; 86970 ; 40670 ; 43175 ; 65000 ; 75000 ; 40565 ; 42565 ; 45210 ; 89765 ; 84600 ; 35940 ; 
37390 ; 40590 ; 48195 ; 56595 ; 33895 ; 41045 ; 76200 ; 44535 ; 51535 ; 34495 ; 81795 ; 18345 ; 29380 ; 37530 ; 33260 ; 18739 ; 69995 ; 74995 ; 81995 ; 86995 ; 63200 ; 22388 ; 25193 ; 25700 ; 27200 ; 90520 ; 121770 ; 126670 ; 40320 ; 56170 ; 25092 ; 26992 ; 29562 ; 26910 ; 34390 ; 33500 ; 79165 ; 84165 ; 76765 ; 192465 ; 43365 ; 52365 ; 25045 ; 31545 ; 22570 ; 25130 ; 52795 ; 46995 ; 42735 ; 41465 ; 32235 ; 41475 ; 34560 ; 31890 ; 35725 ; 46265 ; 49995 ; 31849 ; 52775 ; 33840 ; 35695 ; 36945 ; 37000 ; 52195 ; 37895 ; 26545 ; 30295 ; 29670 ; 27560 ; 20449 ; 27905 ; 19635 ; 72250 ; 45700 ; 64800 ; 39195 ; 42915 ; 76870 ; 46470 ; 29995 ; 30492 ; 33112 ; 27339 ; 21595 ; 56665 ; 20585 ; 23699 ; 27710 ; 27930 ; 54765 ; 35515 ; 41250 ; 20255 ; 22515 ; 19860 ; 18690 ; 21589 ; 20130 ; 25520 ; 39250 ; 25995 ; 21087 ; 18892 ; 20939 ; 17163 ; 20290 ; 40840 ; 49090 ; 32845 ; 22225 ; 31230 ; 17475 ; 22290 ; 34895 ; 36395 ; 11905 ; 32455 ; 33780 ; 50670 ; 60670 ; 22595 ; 17495 ; 28739 ; 17045 ; 40845 ; 23560 ; 14165 ; 21445 ; 23895 ; 16497 ; 16695 ; 19005 ; 24955 ; 40235 ; 26135 ; 35145 ; 26395 ; 27020 ; 27490 ; 38380 ; 21795 ; 32660 ; 26930 ; 25640 ; 24950 ; 27450 ; 20615 ; 28750 ; 33995 ; 24780 ; 32780 ; 28790 ; 23845 ; 31370 ; 23495 ; 28800 ; 52975 ; 36100 ; 18760 ; 20310 ; 40340 ; 41995 ; 17630 ; 20300 ; 20215 ; 22010 ; 33540 ; 14385 ; 16530 ; 25717 ; 29322 ; 25395 ; 14840 ; 22350 ; 19479 ; 26650 ; 24520 ; 12800 ; 16495 ; 25935"
carWidth <- "66 ; 66 ; 69 ; 68 ; 69 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 66 ; 66 ; 66 ; 68 ; 68 ; 68 ; 72 ; 66 ; 66 ; 68 ; 68 ; 68 ; NA ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 67 ; 67 ; 67 ; 68 ; 68 ; 67 ; 68 ; 68 ; 68 ; 68 ; 67 ; 67 ; 67 ; 65 ; 65 ; 65 ; 73 ; 73 ; 70 ; 70 ; 73 ; 67 ; 67 ; 71 ; 71 ; 75 ; 71 ; 71 ; 67 ; 73 ; 73 ; 71 ; 71 ; 68 ; 67 ; 68 ; 67 ; 72 ; 72 ; 72 ; NA ; 70 ; 73 ; 67 ; 72 ; 67 ; 70 ; 67 ; 70 ; 70 ; 74 ; 68 ; 69 ; 69 ; 69 ; 72 ; 71 ; 71 ; 72 ; 72 ; 68 ; 68 ; 68 ; 68 ; 68 ; 68 ; 69 ; 70 ; 69 ; 74 ; 73 ; 73 ; 73 ; 73 ; 70 ; 73 ; 74 ; 74 ; 74 ; 67 ; 64 ; 75 ; 78 ; 78 ; 72 ; 71 ; 72 ; 72 ; 69 ; 72 ; 70 ; 73 ; 68 ; 68 ; 78 ; 78 ; 73 ; 70 ; 72 ; 70 ; 72 ; 72 ; 70 ; 74 ; 69 ; 69 ; 69 ; 72 ; 71 ; 72 ; 68 ; 68 ; 69 ; 68 ; 72 ; 70 ; 70 ; 70 ; 70 ; 71 ; 71 ; 69 ; 69 ; 69 ; 69 ; 69 ; 69 ; 73 ; 74 ; 75 ; 71 ; 74 ; 69 ; 78 ; 69 ; 70 ; 70 ; 71 ; 68 ; 68 ; 73 ; 73 ; 68 ; 68 ; 68 ; 68 ; 68 ; 78 ; 78 ; 74 ; 69 ; 69 ; 71 ; 71 ; 69 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 72 ; 72 ; 72 ; 72 ; 70 ; 70 ; 71 ; 71 ; 75 ; 70 ; 69 ; 73 ; 73 ; 75 ; 75 ; 75 ; 74 ; 74 ; 75 ; 70 ; 73 ; 72 ; 72 ; 72 ; 73 ; 73 ; 73 ; 71 ; 71 ; 72 ; 73 ; 73 ; 78 ; 78 ; 78 ; 68 ; 73 ; 73 ; 69 ; 69 ; 71 ; 71 ; 73 ; 73 ; 69 ; 69 ; 75 ; 75 ; 72 ; 72 ; 72 ; 71 ; 78 ; 73 ; 73 ; 73 ; 70 ; 70 ; 70 ; 70 ; 72 ; 74 ; 74 ; 70 ; 75 ; 73 ; 73 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 71 ; 72 ; 66 ; 66 ; NA ; NA ; 72 ; 72 ; 72 ; 68 ; 68 ; 69 ; 69 ; 70 ; 72 ; 72 ; 73 ; 70 ; 72 ; 70 ; 72 ; 70 ; 70 ; 69 ; 69 ; 68 ; 67 ; 79 ; 73 ; 79 ; 79 ; 76 ; 80 ; 79 ; 75 ; 79 ; 79 ; 81 ; 76 ; 80 ; 79 ; 78 ; 77 ; 73 ; 74 ; 75 ; 74 ; 75 ; 72 ; 77 ; 70 ; 72 ; 73 ; 76 ; 74 ; 76 ; 73 ; 76 ; 71 ; 72 ; 72 ; 74 ; 75 ; 72 ; 74 ; 76 ; 72 ; 70 ; 74 ; 72 ; 76 ; 76 ; 75 ; 67 ; 70 ; 70 ; 72 ; 73 ; 72 ; 67 ; 74 ; 71 ; 72 ; 69 ; 70 ; 67 ; 68 ; 71 ; 70 ; 69 ; 70 ; 79 ; 67 ; 73 ; 76 ; 76 ; 66 ; 68 ; 68 ; 71 ; 71 ; 73 ; 67 ; 74 ; 70 ; 71 ; 69 ; 67 ; 68 ; 69 ; 68 ; 70 ; 68 ; 69 ; 69 ; 68 ; 73 ; 78 ; 72 ; 79 ; 79 ; 79 ; 79 ; 77 ; 78 ; 76 ; 76 ; 75 ; 72 ; 77 ; 
78 ; 78 ; 72 ; 72 ; 72 ; 77 ; 77 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA"
carHwyMPG <- as.integer(strsplit("34 ; 34 ; 37 ; 37 ; 37 ; 36 ; 36 ; 33 ; 36 ; 33 ; 33 ; 38 ; 44 ; 38 ; 33 ; 33 ; 33 ; 34 ; 34 ; 34 ; 30 ; 33 ; 32 ; 32 ; 32 ; 32 ; NA ; 37 ; NA ; NA ; 35 ; 35 ; 33 ; 35 ; 35 ; 35 ; 35 ; 35 ; 38 ; 31 ; 31 ; 31 ; 30 ; 40 ; 40 ; 40 ; 43 ; 39 ; 43 ; 30 ; 32 ; 34 ; 30 ; 32 ; 29 ; 29 ; 30 ; 28 ; 29 ; 28 ; 28 ; 28 ; 27 ; 26 ; 34 ; 34 ; 37 ; 30 ; 51 ; 66 ; 27 ; 27 ; 27 ; NA ; 32 ; 27 ; 34 ; NA ; NA ; 26 ; 28 ; 32 ; 29 ; 30 ; 33 ; 28 ; 28 ; 28 ; 27 ; 33 ; 29 ; 33 ; 29 ; 51 ; 31 ; 31 ; 46 ; 31 ; 31 ; 29 ; 31 ; 29 ; 29 ; 30 ; 28 ; 30 ; 28 ; 32 ; 28 ; 27 ; 29 ; 27 ; 27 ; 30 ; 27 ; 25 ; 25 ; 30 ; 30 ; 26 ; 26 ; 26 ; 26 ; 26 ; 25 ; 30 ; 26 ; 25 ; 25 ; 26 ; 25 ; 26 ; 26 ; 28 ; 28 ; 29 ; 30 ; 28 ; 27 ; 26 ; 29 ; 29 ; 29 ; 30 ; 30 ; 31 ; 29 ; 28 ; 30 ; 28 ; 26 ; 25 ; 27 ; 25 ; 29 ; 27 ; 27 ; 30 ; 30 ; 29 ; 28 ; 29 ; 29 ; 25 ; 27 ; 28 ; 25 ; 26 ; 26 ; 25 ; 29 ; 25 ; 24 ; 26 ; 26 ; 25 ; 25 ; 26 ; 26 ; 27 ; 25 ; 23 ; NA ; 28 ; 28 ; 29 ; 29 ; 26 ; 29 ; 26 ; 25 ; 27 ; 28 ; 25 ; 28 ; 27 ; 24 ; 24 ; 27 ; 25 ; 25 ; 24 ; 24 ; 20 ; 28 ; 30 ; 26 ; 26 ; 26 ; 28 ; 26 ; 26 ; 26 ; 23 ; 23 ; 26 ; 28 ; 24 ; 28 ; 28 ; 24 ; 25 ; 23 ; 25 ; 24 ; 24 ; 25 ; 25 ; 25 ; 21 ; 24 ; 19 ; 26 ; 22 ; 27 ; 20 ; 26 ; 24 ; 29 ; 30 ; NA ; NA ; 28 ; 26 ; 26 ; 24 ; 22 ; 28 ; 28 ; 29 ; 24 ; 23 ; 28 ; 29 ; 25 ; 25 ; 25 ; 25 ; NA ; 29 ; 25 ; 24 ; 25 ; 26 ; 26 ; 26 ; 23 ; 23 ; 23 ; 28 ; 28 ; 25 ; 24 ; 23 ; 21 ; 19 ; 29 ; 22 ; 28 ; 28 ; 26 ; 26 ; 26 ; NA ; 26 ; 24 ; 26 ; 24 ; 29 ; 26 ; 27 ; 24 ; 33 ; 32 ; 18 ; 21 ; 18 ; 18 ; 21 ; NA ; 19 ; 19 ; 19 ; 17 ; 12 ; 20 ; 18 ; 19 ; 17 ; 23 ; 23 ; 22 ; 21 ; 26 ; 21 ; 20 ; 22 ; 21 ; 21 ; 19 ; 16 ; 19 ; 17 ; 24 ; 18 ; 14 ; 17 ; 21 ; 21 ; 19 ; 21 ; 26 ; 18 ; 26 ; 22 ; 21 ; 24 ; 17 ; 20 ; 20 ; 22 ; 23 ; 25 ; 24 ; 26 ; 24 ; 19 ; 16 ; 21 ; 25 ; 27 ; 20 ; 22 ; 27 ; 25 ; 21 ; 26 ; 30 ; 23 ; 33 ; 26 ; 22 ; 19 ; 33 ; 24 ; 25 ; 27 ; 24 ; 26 ; NA ; 25 ; 36 ; 29 ; 34 ; 35 ; 28 ; 28 ; 29 ; 36 ; 30 ; 31 ; 25 ; 29 ; 27 ; 17 ; 26 ; 26 ; 25 ; 26 ; 25 ; 23 ; 20 ; 25 ; 
25 ; 22 ; 25 ; 23 ; 26 ; 25 ; 26 ; 26 ; 24 ; 27 ; 27 ; 17 ; 18 ; 23 ; 21 ; 17 ; 19 ; 22 ; 22 ; 21 ; 19 ; 18 ; 29 ; 24 ; 20 ; NA ; 19 ; 29 ; 19 ; 20 ; 18 ; 28 ; 27 ; 20 ; 17", " ; ")[[1]])
## Warning: NAs introduced by coercion
# Assemble the cars data frame by splitting each " ; "-delimited string
# into a vector; as.integer() turns the literal "NA" tokens (and any
# non-numeric tokens) into NA, which triggers the coercion warnings below.
# suv is stored as 0/1 and converted to logical via the integer step.
cars <- data.frame(city_mpg=as.integer(strsplit(carCityMPG, " ; ")[[1]]),
suv=as.logical(as.integer(strsplit(carSUV, " ; ")[[1]])),
ncyl=as.integer(strsplit(carNCyl, " ; ")[[1]]),
horsepwr=as.integer(strsplit(carHP, " ; ")[[1]]),
msrp=as.integer(strsplit(carMSRP, " ; ")[[1]]),
width=as.integer(strsplit(carWidth, " ; ")[[1]]),
hwy_mpg=carHwyMPG
)
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
colSums(is.na(cars))
## city_mpg suv ncyl horsepwr msrp width hwy_mpg
## 14 0 0 0 0 28 14
# Learn data structure
str(cars)
## 'data.frame': 428 obs. of 7 variables:
## $ city_mpg: int 28 28 26 26 26 29 29 26 27 26 ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr: int 103 103 140 140 140 132 132 130 110 130 ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
# Create faceted histogram
ggplot(cars, aes(x = city_mpg)) +
geom_histogram() +
facet_grid(. ~ suv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).
# Filter cars with 4, 6, 8 cylinders
common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
# Create box plots of city mpg by ncyl
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
geom_density(alpha = .3)
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Create hist of horsepwr
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
ggtitle("Histogram of Horsepower")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create hist of horsepwr for affordable cars
cars %>%
filter(msrp < 25000) %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
xlim(c(90, 550)) +
ggtitle("Histogram of Horsepower\n(Affordable Cars Only)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
# Create hist of horsepwr with binwidth of 3
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 3) +
ggtitle("Histogram of Horsepower\n(Bucket Size=3)")
# Create hist of horsepwr with binwidth of 30
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 30) +
ggtitle("Histogram of Horsepower\n(Bucket Size=30)")
# Create hist of horsepwr with binwidth of 60
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 60) +
ggtitle("Histogram of Horsepower\n(Bucket Size=60)")
# Construct box plot of msrp
cars %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
# Exclude outliers from data
# NOTE(review): cars_no_out is created here but not referenced by any of
# the later snippets in this section — verify it is used downstream
cars_no_out <- cars %>%
filter(msrp < 100000)
# Create plot of city_mpg
cars %>%
ggplot(aes(x=city_mpg)) +
geom_density()
## Warning: Removed 14 rows containing non-finite values (stat_density).
# Create plot of width
cars %>%
ggplot(aes(x=width)) +
geom_density()
## Warning: Removed 28 rows containing non-finite values (stat_density).
# Create plot of city_mpg
cars %>%
ggplot(aes(x=factor(1), y=city_mpg)) +
geom_boxplot()
## Warning: Removed 14 rows containing non-finite values (stat_boxplot).
# Create plot of width
cars %>%
ggplot(aes(x=factor(1), y=width)) +
geom_boxplot()
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
# Facet hists using hwy mileage and ncyl
common_cyl %>%
ggplot(aes(x = hwy_mpg)) +
geom_histogram() +
facet_grid(ncyl ~ suv) +
ggtitle("Histogram of HighwayMPG\n(By Cylinders vs. SUV)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
Chapter 3 - Numerical summaries
Measures of center - “what is the typical value”?:
Measures of variability - what are the typical distances from “typical”?:
Shape and transformations - modality and skew:
Outliers - observations with extreme values:
Example code includes:
# Create the data assumed for the exercises: load gapminder from its
# package and convert to a tibble for nicer printing/subsetting
data(gapminder, package="gapminder")
gapminder <- tibble::as_tibble(gapminder)
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Create dataset of 2007 data (the most recent year in gapminder)
gap2007 <- filter(gapminder, year == 2007)
# Compute groupwise mean and median lifeExp
gap2007 %>%
group_by(continent) %>%
summarize(mean(lifeExp),
median(lifeExp)
)
## # A tibble: 5 × 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fctr> <dbl> <dbl>
## 1 Africa 54.80604 52.9265
## 2 Americas 73.60812 72.8990
## 3 Asia 70.72848 72.3960
## 4 Europe 77.64860 78.6085
## 5 Oceania 80.71950 80.7195
# Generate box plots of lifeExp for each continent
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
# Compute groupwise measures of spread
gap2007 %>%
group_by(continent) %>%
summarize(sd(lifeExp),
IQR(lifeExp),
n()
)
## # A tibble: 5 × 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fctr> <dbl> <dbl> <int>
## 1 Africa 9.6307807 11.61025 52
## 2 Americas 4.4409476 4.63200 25
## 3 Asia 7.9637245 10.15200 33
## 4 Europe 2.9798127 4.78250 30
## 5 Oceania 0.7290271 0.51550 2
# Generate overlaid density plots
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)
# Compute stats for lifeExp in Americas
gap2007 %>%
filter(continent == "Americas") %>%
summarize(mean(lifeExp),
sd(lifeExp)
)
## # A tibble: 1 × 2
## `mean(lifeExp)` `sd(lifeExp)`
## <dbl> <dbl>
## 1 73.60812 4.440948
# Compute stats for population
gap2007 %>%
summarize(median(pop),
IQR(pop)
)
## # A tibble: 1 × 2
## `median(pop)` `IQR(pop)`
## <dbl> <dbl>
## 1 10517531 26702008
# Create density plot of old variable
gap2007 %>%
ggplot(aes(x = pop)) +
geom_density()
# Transform the skewed pop variable: log() pulls in the long right tail
gap2007 <- gap2007 %>%
mutate(log_pop = log(pop))
# Create density plot of new variable
gap2007 %>%
ggplot(aes(x = log_pop)) +
geom_density()
# Filter for Asia, add column indicating outliers
# (here "outlier" is defined ad hoc as life expectancy below 50)
gap_asia <- gap2007 %>%
filter(continent == "Asia") %>%
mutate(is_outlier = (lifeExp < 50))
# Remove outliers, create box plot of lifeExp
# (x = factor(1) gives a single box for the whole remaining sample)
gap_asia %>%
filter(!is_outlier) %>%
ggplot(aes(x = factor(1), y = lifeExp)) +
geom_boxplot()
Chapter 4 - Case Study
Introducing the data - the email dataset (tibble 3,921 x 21):
Check-in #1:
Check-in #2:
Example code includes:
data(email, package="openintro")
email <- tibble::as_tibble(email)
str(email)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3921 obs. of 21 variables:
## $ spam : num 0 0 0 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 1 1 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 0 0 0 0 0 1 0 0 ...
## $ sent_email : num 0 0 0 0 0 0 1 1 0 0 ...
## $ time : POSIXct, format: "2012-01-01 00:16:41" "2012-01-01 01:03:59" ...
## $ image : num 0 0 0 0 0 0 0 1 0 0 ...
## $ attach : num 0 0 0 0 0 0 0 1 0 0 ...
## $ dollar : num 0 0 4 0 0 0 0 0 0 0 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 1 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 2 2 0 0 0 0 ...
## $ num_char : num 11.37 10.5 7.77 13.26 1.23 ...
## $ line_breaks : int 202 202 192 255 29 25 193 237 69 68 ...
## $ format : num 1 1 1 1 0 0 1 1 0 1 ...
## $ re_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 0 1 6 48 1 1 1 18 1 0 ...
## $ number : Factor w/ 3 levels "none","small",..: 3 2 2 2 1 1 3 2 2 2 ...
# Compute summary statistics
email %>%
group_by(spam) %>%
summarize(median(num_char), IQR(num_char))
## # A tibble: 2 × 3
## spam `median(num_char)` `IQR(num_char)`
## <dbl> <dbl> <dbl>
## 1 0 6.831 13.58225
## 2 1 1.046 2.81800
# Create plot
email %>%
mutate(log_num_char = log(num_char)) %>%
ggplot(aes(x = factor(spam), y = log_num_char)) +
geom_boxplot()
# Create plot for spam and exclaim_mess
# log(1 + x) is used because exclaim_mess is often 0 and log(0) = -Inf
email %>% ggplot(aes(x=log(1 + exclaim_mess), fill=factor(spam))) + geom_density(alpha=0.5)
# Create plot of proportion of spam by image
# (position = "fill" shows the spam proportion within each has_image group)
email %>%
mutate(has_image = (image > 0)) %>%
ggplot(aes(x = has_image, fill = factor(spam))) +
geom_bar(position = "fill")
# Do images get counted as attachments?
# A count of 0 rows with image > attach says attach is always >= image
sum(email$image > email$attach)
## [1] 0
# Question 1
email %>%
filter(dollar > 0) %>%
group_by(spam) %>%
summarize(mean(dollar))
## # A tibble: 2 × 2
## spam `mean(dollar)`
## <dbl> <dbl>
## 1 0 8.211078
## 2 1 3.435897
# Question 2
email %>%
filter(dollar > 10) %>%
ggplot(aes(x = factor(spam))) +
geom_bar()
# Reorder levels
email$number <- factor(email$number, levels=c("none", "small", "big"))
# Construct plot of number
ggplot(email, aes(x=number, fill=factor(spam))) +
geom_bar(position="fill")
Chapter 1 - Introduction to Ideas of Inference
Statistical inference is the process of making claims about a population based on information from a sample of data:
Randomized distributions:
Using the randomization distribution - comparing the observed statistic to the null distribution:
The sample being consistent with the null hypothesis does not “prove” the null hypothesis; you can only “reject” the null hypothesis
Example code includes:
# PROBLEM - I DO NOT HAVE oilabs::rep_sample_n() ; cut/paste to replicate as oilabs_rep_sample_n
# Copied code from https://github.com/OpenIntroOrg/oilabs/blob/master/R/rep_sample_n.R
# Draw `reps` repeated samples of `size` rows from `tbl`.
#
# Each replicate independently samples `size` row indices (with or without
# replacement); all replicates are stacked into one data frame tagged with a
# `replicate` column, and the result is grouped by that column. Local
# re-implementation of oilabs::rep_sample_n().
oilabs_rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1) {
  n_rows <- nrow(tbl)
  # One index vector per replicate, flattened in replicate order
  idx <- unlist(lapply(seq_len(reps), function(r) {
    sample.int(n_rows, size, replace = replace)
  }))
  sampled_rows <- tbl[idx, ]
  # Tag every sampled row with the replicate it belongs to
  rep_tbl <- cbind(replicate = rep(seq_len(reps), each = size), sampled_rows)
  dplyr::group_by(rep_tbl, replicate)
}
And, then the actual coding:
data(NHANES, package="NHANES")
# What are the variables in the NHANES dataset?
names(NHANES)
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeDecade" "AgeMonths"
## [7] "Race1" "Race3" "Education"
## [10] "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn"
## [16] "Work" "Weight" "Length"
## [19] "HeadCirc" "Height" "BMI"
## [22] "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1"
## [28] "BPDia1" "BPSys2" "BPDia2"
## [31] "BPSys3" "BPDia3" "Testosterone"
## [34] "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2"
## [40] "Diabetes" "DiabetesAge" "HealthGen"
## [43] "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [46] "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble"
## [52] "PhysActive" "PhysActiveDays" "TVHrsDay"
## [55] "CompHrsDay" "TVHrsDayChild" "CompHrsDayChild"
## [58] "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n"
## [64] "SmokeAge" "Marijuana" "AgeFirstMarij"
## [67] "RegularMarij" "AgeRegMarij" "HardDrugs"
## [70] "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation"
## [76] "PregnantNow"
# Create bar plot for Home Ownership by Gender
ggplot(NHANES, aes(x = Gender, fill = HomeOwn)) +
geom_bar(position = "fill") +
ylab("Relative frequencies")
# Density for SleepHrsNight colored by SleepTrouble, faceted by HealthGen
ggplot(NHANES, aes(x = SleepHrsNight, col = SleepTrouble)) +
geom_density(adjust = 2) +
facet_wrap(~ HealthGen)
## Warning: Removed 2245 rows containing non-finite values (stat_density).
# Subset the data: homes — keep only the two ownership outcomes of interest
homes <- NHANES %>%
select(Gender, HomeOwn) %>%
filter(HomeOwn %in% c("Own", "Rent"))
# Perform one permutation: sample() with no size argument shuffles
# HomeOwn across all rows, breaking any link with Gender (the null)
homes %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own))
## # A tibble: 1 × 2
## diff_perm diff_orig
## <dbl> <dbl>
## 1 0.001644559 -0.007828723
# Perform 10 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(size = nrow(homes), reps = 10) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Print differences to console
homeown_perm
## # A tibble: 10 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 -0.007416841 -0.007828723
## 2 2 0.023886176 -0.007828723
## 3 3 -0.005769314 -0.007828723
## 4 4 0.004939613 -0.007828723
## 5 5 0.005351495 -0.007828723
## 6 6 -0.008240605 -0.007828723
## 7 7 -0.006593078 -0.007828723
## 8 8 -0.001238614 -0.007828723
## 9 9 -0.020185177 -0.007828723
## 10 10 0.007822786 -0.007828723
# Dotplot of 10 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_dotplot(binwidth = 0.001)
# Perform 100 permutations
# Note: size is named explicitly — positional matching of the first argument
# after the pipe is fragile and inconsistent with the reps = 10 call earlier.
homeown_perm <- homes %>%
  oilabs_rep_sample_n(size = nrow(homes), reps = 100) %>%
  # Shuffle home ownership within each replicate (null: no association with Gender)
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(replicate, Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
            prop_own = mean(HomeOwn == "Own")) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own)) # male - female
# Dotplot of 100 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_dotplot(binwidth = 0.001)
# Perform 1000 permutations
# Note: size is named explicitly — positional matching of the first argument
# after the pipe is fragile and inconsistent with the reps = 10 call earlier.
homeown_perm <- homes %>%
  oilabs_rep_sample_n(size = nrow(homes), reps = 1000) %>%
  # Shuffle home ownership within each replicate (null: no association with Gender)
  mutate(HomeOwn_perm = sample(HomeOwn)) %>%
  group_by(replicate, Gender) %>%
  summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
            prop_own = mean(HomeOwn == "Own")) %>%
  summarize(diff_perm = diff(prop_own_perm),
            diff_orig = diff(prop_own)) # male - female
# Density plot of 1000 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
  geom_density()
# Plot permuted differences
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_density() +
geom_vline(aes(xintercept = diff_orig),
col = "red")
# Compare permuted differences to observed difference
homeown_perm %>%
summarize(sum(diff_orig >= diff_perm))
## # A tibble: 1 × 1
## `sum(diff_orig >= diff_perm)`
## <int>
## 1 208
Chapter 2 - Completing a randomization study
Gender discrimination case - promotion case study among bank managers:
Distribution of statistics - different forms of the null hypothesis:
Why 0.05 for the critical region?
What is a p-value?
Example code includes:
discPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
discSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female"
# Assemble the promotion study data: split the " ; "-delimited strings into
# vectors and encode both as factors. promote's levels are set explicitly so
# "not_promoted" is the first (reference) level and "promoted" the second.
disc <- data.frame(promote=factor(strsplit(discPromote, " ; ")[[1]],
levels=c("not_promoted", "promoted")
),
sex=factor(strsplit(discSex, " ; ")[[1]])
)
# Create a contingency table summarizing the data
# (rows = sex, columns = promotion outcome)
table(disc$sex, disc$promote)
##
## not_promoted promoted
## female 10 14
## male 3 21
# Find proportion of each sex who were promoted
disc %>%
group_by(sex) %>%
summarize(promoted_prop=mean(promote == "promoted"))
## # A tibble: 2 × 2
## sex promoted_prop
## <fctr> <dbl>
## 1 female 0.5833333
## 2 male 0.8750000
# Sample the entire data frame 5 times
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5)
## Source: local data frame [240 x 3]
## Groups: replicate [5]
##
## replicate promote sex
## * <int> <fctr> <fctr>
## 1 1 promoted male
## 2 1 promoted female
## 3 1 promoted male
## 4 1 promoted female
## 5 1 promoted female
## 6 1 promoted female
## 7 1 not_promoted male
## 8 1 not_promoted female
## 9 1 promoted male
## 10 1 not_promoted female
## # ... with 230 more rows
# Shuffle the promote variable within replicate
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote))
## Source: local data frame [240 x 4]
## Groups: replicate [5]
##
## replicate promote sex prom_perm
## <int> <fctr> <fctr> <fctr>
## 1 1 promoted female promoted
## 2 1 promoted male not_promoted
## 3 1 promoted female promoted
## 4 1 not_promoted female promoted
## 5 1 promoted female promoted
## 6 1 promoted male promoted
## 7 1 promoted male promoted
## 8 1 promoted female promoted
## 9 1 promoted male not_promoted
## 10 1 promoted male promoted
## # ... with 230 more rows
# Find the proportion of promoted in each replicate and sex
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted"))
## Source: local data frame [10 x 4]
## Groups: replicate [?]
##
## replicate sex prop_prom_perm prop_prom
## <int> <fctr> <dbl> <dbl>
## 1 1 female 0.7916667 0.5833333
## 2 1 male 0.6666667 0.8750000
## 3 2 female 0.6666667 0.5833333
## 4 2 male 0.7916667 0.8750000
## 5 3 female 0.6250000 0.5833333
## 6 3 male 0.8333333 0.8750000
## 7 4 female 0.6666667 0.5833333
## 8 4 male 0.7916667 0.8750000
## 9 5 female 0.7916667 0.5833333
## 10 5 male 0.6666667 0.8750000
# Difference in proportion of promoted across sex grouped by gender
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
## # A tibble: 5 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 0.12500000 0.2916667
## 2 2 -0.04166667 0.2916667
## 3 3 -0.20833333 0.2916667
## 4 4 -0.04166667 0.2916667
## 5 5 0.29166667 0.2916667
# Create a data frame of differences in promotion rates
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Histogram of permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Find the 0.90, 0.95, and 0.99 quantiles of diff_perm
disc_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.1333333 0.2083333 0.2916667
# Find the 0.10, 0.05, and 0.01 quantiles of diff_perm
disc_perm %>%
summarize(q.01 = quantile(diff_perm, p = 0.01),
q.05 = quantile(diff_perm, p = 0.05),
q.10 = quantile(diff_perm, p = 0.10)
)
## # A tibble: 1 × 3
## q.01 q.05 q.10
## <dbl> <dbl> <dbl>
## 1 -0.2916667 -0.2083333 -0.125
discsmallSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 1 ; 1 ; 1" # 2 is male
discbigSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is male
discbigPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is promote
discsmallPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1" # 2 is promote
dsSex <- factor(strsplit(discsmallSex, " ; ")[[1]],
labels=c("female", "male")
)
dbSex <- factor(strsplit(discbigSex, " ; ")[[1]],
labels=c("female", "male")
)
dsPromote <- factor(strsplit(discsmallPromote, " ; ")[[1]],
labels=c("not_promoted", "promoted")
)
dbPromote <- factor(strsplit(discbigPromote, " ; ")[[1]],
labels=c("not_promoted", "promoted")
)
disc_small <- data.frame(sex=dsSex, promote=dsPromote)
disc_big <- data.frame(sex=dbSex, promote=dbPromote)
# Tabulate the small and big data frames
disc_small %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 3 5
## male 1 7
disc_big %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 100 140
## male 30 210
# Create 1000 permutations for the small dataset
disc_small_perm <- disc_small %>%
oilabs_rep_sample_n(size = nrow(disc_small), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Create 1000 permutations for the big dataset
disc_big_perm <- disc_big %>%
oilabs_rep_sample_n(size = nrow(disc_big), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Plot the distributions of permuted differences
ggplot(disc_small_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
ggplot(disc_big_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Recall the quantiles associated with the original dataset
disc_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.1333333 0.2083333 0.2916667
# Calculate the quantiles associated with the small dataset
disc_small_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.25 0.25 0.5
# Calculate the quantiles associated with the big dataset
disc_big_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.05833333 0.06666667 0.09166667
# Calculate the p-value for the original dataset
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.023
# Calculate the p-value for the small dataset
disc_small_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.291
# Calculate the p-value for the big dataset
disc_big_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0
dnPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
dnSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female"
disc_new <- data.frame(promote=factor(strsplit(dnPromote, " ; ")[[1]],
levels=c("not_promoted", "promoted")
),
sex=factor(strsplit(dnSex, " ; ")[[1]])
)
# Create 1000 permutations for each dataset
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
disc_new_perm <- disc_new %>%
oilabs_rep_sample_n(size = nrow(disc_new), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Recall the original data
disc %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 10 14
## male 3 21
# Tabulate the new data
disc_new %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 7 17
## male 6 18
# Plot the distribution of the original permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig), col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot the distribution of the new permuted differences
ggplot(disc_new_perm, aes(x = diff_perm)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig), col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the p-value from the original data
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.025
# Find the p-value from the new data
disc_new_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.54
Chapter 3 - Hypothesis Testing Errors
Opportunity cost - do reminders about saving money encourage students to purchase fewer DVDs? (Frederick et al study):
Errors and their consequences - consequences of various conclusions and associated errors:
Example code includes:
oppDec <- "buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD"
oppGroup <- "control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment"
opportunity <- data.frame(decision=factor(strsplit(oppDec, " ; ")[[1]]),
group=factor(strsplit(oppGroup, " ; ")[[1]])
)
# Tabulate the data
opportunity %>%
select(decision, group) %>%
table()
## group
## decision control treatment
## buyDVD 56 41
## nobuyDVD 19 34
# Find the proportion who bought the DVD in each group
opportunity %>%
group_by(group) %>%
summarize(buy_prop = mean(decision == "buyDVD"))
## # A tibble: 2 × 2
## group buy_prop
## <fctr> <dbl>
## 1 control 0.7466667
## 2 treatment 0.5466667
# Create a barplot
ggplot(opportunity, aes(x = group, fill = decision)) +
geom_bar(position="fill")
# Data frame of differences in purchase rates after permuting
opp_perm <- opportunity %>%
oilabs_rep_sample_n(size = nrow(opportunity), reps = 1000) %>%
mutate(dec_perm = sample(decision)) %>%
group_by(replicate, group) %>%
summarize(prop_buy_perm = mean(dec_perm == "buyDVD"),
prop_buy = mean(decision == "buyDVD")) %>%
summarize(diff_perm = diff(prop_buy_perm),
diff_orig = diff(prop_buy)) # treatment - control
# Histogram of permuted differences
ggplot(opp_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = .005) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Calculate the p-value
opp_perm %>%
summarize(mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.01
# Calculate the two-sided p-value
opp_perm %>%
summarize(2*mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `2 * mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.02
Chapter 4 - Confidence Intervals
Parameters and confidence intervals - research questions can be comparative (hypothesis test) or estimation (confidence intervals):
Bootstrapping:
Variability in p-hat - how far are the sample data from the parameter?
Interpreting CI and technical conditions:
Example code includes:
# Do not have this dataset (30000 x 2 - poll-vote) - 30 votes in each of 1000 samples
# voteSum[i] is a count of "yes" (1) votes out of 30 in a poll; voteN[i] is how
# many of the 1000 polls had that count (voteN sums to 1000).
voteSum <- c(9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)
voteN <- c(1, 7, 10, 27, 42, 90, 101, 143, 151, 136, 129, 79, 43, 25, 13, 3)
# Build all 30000 individual votes in one pass. The original grew voteAll with
# c() inside a for loop, which copies the whole vector on every append (O(n^2));
# lapply + unlist produces the identical vector without the repeated copies.
voteAll <- unlist(lapply(seq_along(voteSum), function(i) {
  ballot <- rep(0L, 30)               # one 30-vote poll, all "no"
  ballot[seq_len(voteSum[i])] <- 1L   # first voteSum[i] votes are "yes"
  rep(ballot, times = voteN[i])       # replicate for each poll with this count
}), use.names = FALSE)
# Random, non-repeating poll identifiers 1..1000
voteNum <- sample(1:1000, 1000, replace = FALSE)
# Needs to be a tibble since oilabs_rep_sample_n() has an implied drop=TRUE for data frames
all_polls <- tibble::as_tibble(data.frame(poll=rep(voteNum, each=30),
vote=voteAll
) %>% arrange(poll)
)
# Select one poll from which to resample: one_poll
# (poll 1 plays the role of "the single sample we actually collected")
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Generate 1000 resamples of one_poll: one_poll_boot_30
# Bootstrap: draw n = 30 rows WITH replacement, repeated 1000 times
one_poll_boot_30 <- one_poll %>%
oilabs_rep_sample_n(size = nrow(one_poll), replace = TRUE, reps = 1000)
# Compute p-hat for each poll: ex1_props
# Sampling distribution: one p-hat per poll across all 1000 polls
ex1_props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex2_props
# Bootstrap distribution: one p-hat* per bootstrap replicate
ex2_props <- one_poll_boot_30 %>%
group_by(replicate) %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat and p-hat*
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08874373
# Resample from one_poll with n = 3: one_poll_boot_3
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(3, replace = TRUE, reps = 1000)
# Resample from one_poll with n = 300: one_poll_boot_300
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(300, replace = TRUE, reps = 1000)
# Compute p-hat* for each resampled poll: ex3_props
ex3_props <- one_poll_boot_3 %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex4_props
ex4_props <- one_poll_boot_300 %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat* for n = 3 vs. n = 300
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2964122
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02826066
# Recall the variability of sample proportions
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08874373
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2964122
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02826066
# Create smoothed density curves for all four experiments
ggplot() +
geom_density(data = ex1_props, aes(x = prop_yes), col = "black", bw = .1) +
geom_density(data = ex2_props, aes(x = prop_yes), col = "green", bw = .1) +
geom_density(data = ex3_props, aes(x = prop_yes), col = "red", bw = .1) +
geom_density(data = ex4_props, aes(x = prop_yes), col = "blue", bw = .1)
# Compute proportion of votes for Candidate X: props
props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Proportion of polls within 2SE
props %>%
mutate(lower = mean(prop_yes) - 2 * sd(prop_yes),
upper = mean(prop_yes) + 2 * sd(prop_yes),
in_CI = prop_yes > lower & prop_yes < upper) %>%
summarize(mean(in_CI))
## # A tibble: 1 × 1
## `mean(in_CI)`
## <dbl>
## 1 0.966
# Again, set the one sample that was collected
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Compute p-hat from one_poll: p_hat
p_hat <- mean(one_poll$vote)
# Bootstrap to find the SE of p-hat: one_poll_boot
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(30, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Create an interval of plausible values
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Find the 2.5% and 97.5% of the p-hat values
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, p = 0.025),
q975_prop = quantile(prop_yes_boot, p = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.3333333 0.6666667
# Bootstrap t-confidence interval for comparison
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Recall the bootstrap t-confidence interval
p_hat <- mean(one_poll$vote)
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.3138709 0.6861291
# Collect a sample of 30 observations from the population
# as.tbl() is deprecated in dplyr; use tibble::as_tibble() as done earlier in
# this file. rbinom arguments are fully named to avoid partial matching.
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, size = 1, prob = 0.6)))
# Resample the data using samples of size 300 (an incorrect strategy!)
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(size=300, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_300 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.4497326 0.5502674
# Resample the data using samples of size 3 (an incorrect strategy!)
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(size=3, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
one_poll_boot_3 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 -0.01919018 1.01919
# Collect 30 observations from a population with true proportion of 0.8
# as.tbl() is deprecated in dplyr; use tibble::as_tibble() as done earlier in this file.
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, size = 1, prob = 0.8)))
# Compute p-hat of new sample: p_hat
p_hat <- mean(one_poll$vote)
# Resample the 30 observations (with replacement)
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(size=nrow(one_poll), replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Calculate the bootstrap t-confidence interval
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.6534714 0.9465286
# Calculate a 95% bootstrap percentile interval
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, p = 0.025),
q975_prop = quantile(prop_yes_boot, p = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.6658333 0.9333333
# Calculate a 99% bootstrap percentile interval
one_poll_boot %>%
summarize(q005_prop = quantile(prop_yes_boot, p = 0.005),
q995_prop = quantile(prop_yes_boot, p = 0.995))
## # A tibble: 1 × 2
## q005_prop q995_prop
## <dbl> <dbl>
## 1 0.5666667 0.9666667
# Calculate a 90% bootstrap percentile interval
one_poll_boot %>%
summarize(q05_prop = quantile(prop_yes_boot, p = 0.05),
q95_prop = quantile(prop_yes_boot, p = 0.95))
## # A tibble: 1 × 2
## q05_prop q95_prop
## <dbl> <dbl>
## 1 0.6666667 0.9
Chapter 1 - Correlation and Regression
Modeling bivariate relationships - relationships between two variables:
Characterizing bivariate relationships:
Outliers - points that do not fit with the rest of the data:
Example code includes:
data(ncbirths, package="openintro")
# Scatterplot of weight vs. weeks
ggplot(ncbirths, aes(x=weeks, y=weight)) +
geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).
# Boxplot of weight vs. weeks
ggplot(data = ncbirths,
aes(x = cut(weeks, breaks = 5), y = weight)) +
geom_boxplot()
# Mammals scatterplot
data(mammals, package="openintro")
ggplot(mammals, aes(x=BodyWt, y=BrainWt)) +
geom_point()
# Baseball player scatterplot
data(mlbBat10, package="openintro")
ggplot(mlbBat10, aes(x=OBP, y=SLG)) +
geom_point()
# Body dimensions scatterplot
data(bdims, package="openintro")
ggplot(bdims, aes(x=hgt, y=wgt, color=factor(sex))) +
geom_point()
# Smoking scatterplot
data(smoking, package="openintro")
ggplot(smoking, aes(x=age, y=amtWeekdays)) +
geom_point()
## Warning: Removed 1270 rows containing missing values (geom_point).
# Scatterplot with coord_trans()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# Scatterplot with scale_x_log10() and scale_y_log10()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
scale_x_log10() + scale_y_log10()
# Scatterplot of SLG vs. OBP
mlbBat10 %>%
filter(AB >= 200) %>%
ggplot(aes(x = OBP, y = SLG)) +
geom_point()
# Identify the outlying player
mlbBat10 %>%
filter(AB >= 200, OBP < 0.2)
## name team position G AB R H 2B 3B HR RBI TB BB SO SB CS OBP
## 1 B Wood LAA 3B 81 226 20 33 2 0 4 14 47 6 71 1 0 0.174
## SLG AVG
## 1 0.208 0.146
Chapter 2 - Correlation
Quantifying strength of bivariate relationship - correlation:
Anscombe dataset - synthetic datasets of the problems with correlation (and regression):
Interpretation of correlation - correlation is not causality:
Spurious correlation:
Example code includes:
data(ncbirths, package="openintro")
# Compute correlation
ncbirths %>%
summarize(N = n(), r = cor(weight, mage))
## N r
## 1 1000 0.05506589
# Compute correlation for all non-missing pairs
ncbirths %>%
summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## N r
## 1 1000 0.6701013
data(anscombe)
# Reshape the Anscombe quartet from wide (x1..x4, y1..y4) to long form:
# one row per observed point, tagged with its point id (1..11) and set (1..4).
Anscombe <- data.frame(x = unlist(anscombe[1:4], use.names = FALSE),
                       y = unlist(anscombe[5:8], use.names = FALSE),
                       id = rep(seq_len(11), times = 4),
                       set = rep(seq_len(4), each = 11))
ggplot(data = Anscombe, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ set)
# Compute properties of Anscombe
Anscombe %>%
group_by(set) %>%
summarize(N = n(), mean(x), sd(x), mean(y), sd(y), cor(x, y))
## # A tibble: 4 × 7
## set N `mean(x)` `sd(x)` `mean(y)` `sd(y)` `cor(x, y)`
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.316625 7.500909 2.031568 0.8164205
## 2 2 11 9 3.316625 7.500909 2.031657 0.8162365
## 3 3 11 9 3.316625 7.500000 2.030424 0.8162867
## 4 4 11 9 3.316625 7.500909 2.030579 0.8165214
# Correlation examples: full vs. filtered baseball data, grouped body
# dimensions, and the effect of a log transform on the mammals data.
data(mlbBat10, package="openintro")
data(mammals, package="openintro")
data(bdims, package="openintro")
# Correlation for all baseball players
mlbBat10 %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 1199 0.8145628
# Correlation for all players with at least 200 ABs
# Filtering out low-AB players drops r -- the trivial players inflate it
mlbBat10 %>%
filter(AB >= 200) %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 329 0.6855364
# Correlation of body dimensions
bdims %>%
group_by(sex) %>%
summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 × 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.4310593
## 2 1 247 0.5347418
# Correlation among mammals, with and without log
# Logging both weights strengthens the (linear) correlation
mammals %>%
summarize(N = n(),
r = cor(BodyWt, BrainWt),
r_log = cor(log(BodyWt), log(BrainWt)))
## N r r_log
## 1 62 0.9341638 0.9595748
# Demonstrate spurious correlation: 20 independent sets of 50 pure-noise
# points still occasionally show |r| > 0.2.
# NOTE(review): rnorm() is unseeded, so the exact outputs below will not
# reproduce on a re-run; add set.seed() if reproducibility matters.
# Create a random noise dataset
noise <- data.frame(x=rnorm(1000), y=rnorm(1000), z=rep(1:20, each=50))
# Create faceted scatterplot
noise %>%
ggplot(aes(x=x, y=y)) +
geom_point() +
facet_wrap(~ z)
# Compute correlations for each dataset
noise_summary <- noise %>%
group_by(z) %>%
summarize(N = n(), spurious_cor = cor(x, y))
# Isolate sets with correlations above 0.2 in absolute strength
noise_summary %>%
filter(abs(spurious_cor) > 0.2)
## # A tibble: 2 × 3
## z N spurious_cor
## <int> <int> <dbl>
## 1 17 50 -0.2418963
## 2 18 50 -0.2696328
Chapter 3 - Simple Linear Regression
Visualization of linear models - adjusting the intercept and the slope to best fit the data:
Understanding the linear model: Response = f(Explanatory) + Noise:
Regression vs. regression to the mean (Galton):
Example code includes:
# Simple linear regression: fitted line on bdims, slope/intercept from summary
# statistics, and Galton's regression-to-the-mean plots.
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
bdims_summary <- bdims %>%
summarize(N=n(), r=cor(hgt, wgt),
mean_hgt=mean(hgt), sd_hgt=sd(hgt),
mean_wgt=mean(wgt), sd_wgt=sd(wgt)
)
# Print bdims_summary
bdims_summary
## N r mean_hgt sd_hgt mean_wgt sd_wgt
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576
# Add slope and intercept
# slope = r * sd_y / sd_x and the line passes through (mean_x, mean_y)
bdims_summary %>%
mutate(slope = r * sd_wgt / sd_hgt,
intercept = mean_wgt - slope*mean_hgt
)
## N r mean_hgt sd_hgt mean_wgt sd_wgt slope intercept
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576 1.017617 -105.0113
data(GaltonFamilies, package="HistData")
GaltonUse <- GaltonFamilies %>%
mutate(sex=gender, height=childHeight) %>%
select(family, father, mother, sex, height)
# Self-join adds nkids = number of children recorded per family
GaltonUse <- GaltonUse %>%
left_join(GaltonUse %>% group_by(family) %>% summarize(nkids=n()), by="family")
Galton_women <- GaltonUse %>%
filter(sex=="female")
Galton_men <- GaltonUse %>%
filter(sex=="male")
# Height of children vs. height of father
# geom_abline(slope = 1) draws the identity line; the flatter lm line
# illustrates regression to the mean
ggplot(data = Galton_men, aes(x = father, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Height of children vs. height of mother
ggplot(data = Galton_women, aes(x = mother, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
Chapter 4 - Interpreting Regression Models
Interpretation of regression coefficients - UCLA textbook pricing (dataset ‘textbooks’):
Linear model object interpretation:
Using the linear model - residuals can give information about biggest outliers (often interesting):
Example code includes:
# Fitting and inspecting lm objects on bdims, mlbBat10, and mammals.
# Linear model for weight as a function of height
lm(wgt ~ hgt, data = bdims)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data=mlbBat10)
##
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
##
## Coefficients:
## (Intercept) OBP
## 0.009407 1.110323
# Log-linear model for body weight as a function of brain weight
lm(log(BodyWt) ~ log(BrainWt), data=mammals)
##
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Coefficients:
## (Intercept) log(BrainWt)
## -2.509 1.225
mod <- lm(wgt ~ hgt, data = bdims)
# Show the coefficients
coef(mod)
## (Intercept) hgt
## -105.011254 1.017617
# Show the full output
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Least-squares properties, broom::augment() diagnostics, and prediction.
# Mean of weights equal to mean of fitted values?
# NOTE(review): exact == on floats happens to work here; all.equal() would be
# the robust comparison.
mean(bdims$wgt) == mean(fitted.values(mod))
## [1] TRUE
# Mean of the residuals
# Zero up to floating-point error, as least squares guarantees
mean(residuals(mod))
## [1] -1.266971e-15
# Create bdims_tidy
bdims_tidy <- broom::augment(mod)
# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...
ben <- data.frame(wgt=74.8, hgt=182.8)
# Print ben
ben
## wgt hgt
## 1 74.8 182.8
# Predict the weight of ben
predict(mod, newdata=ben)
## 1
## 81.00909
# Add the line to the scatterplot
# t(coef(mod)) yields a 1-row data frame with columns `(Intercept)` and hgt,
# which geom_abline() maps to intercept and slope
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = as.data.frame(t(coef(mod))),
aes(intercept = `(Intercept)`, slope = hgt),
color = "dodgerblue")
Chapter 5 - Model Fit
Assessing model fit - how well does the regression line fit the underlying data?
Comparing model fits:
Unusual points - leverage and influence:
Dealing with unusual points - managing the impacts of leverage and influence:
Example code includes:
# Model fit: RMSE from residuals and R-squared from variance decomposition.
mod <- lm(wgt ~ hgt, data = bdims)
# View summary of model
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Compute RMSE
# Matches the "Residual standard error" reported by summary(mod)
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804
# View model summary
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
bdims_tidy <- broom::augment(mod)
# Compute R-squared
# R^2 = 1 - var(residuals) / var(response); matches Multiple R-squared above
bdims_tidy %>%
summarize(var_y = var(wgt), var_e = var(.resid)) %>%
mutate(R_squared = 1 - var_e/var_y)
## var_y var_e R_squared
## 1 178.1094 86.46839 0.5145208
# Unusual points: rank observations by leverage (.hat) and by influence
# (.cooksd) from broom::augment(), then refit without the extreme OBP point.
mod <- lm(SLG ~ OBP, data=filter(mlbBat10, AB >= 10))
# Rank points of high leverage
mod %>%
broom::augment() %>%
arrange(desc(.hat)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
# Rank influential points
# High Cook's distance = high leverage combined with a large residual
mod %>%
broom::augment() %>%
arrange(desc(.cooksd)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.308 0.550 0.69049108 0.009158810 -0.3824911 0.016410487 0.07011360
## 2 0.833 0.385 0.47211002 0.004190644 0.3608900 0.003435619 0.07028875
## 3 0.800 0.455 0.56475653 0.006186785 0.2352435 0.007488132 0.07101125
## 4 0.379 0.133 0.13858258 0.005792344 0.2404174 0.006563752 0.07098798
## 5 0.786 0.438 0.54225666 0.005678026 0.2437433 0.006307223 0.07097257
## 6 0.231 0.077 0.06446537 0.007506974 0.1665346 0.011024863 0.07127661
## .cooksd .std.resid
## 1 0.24274468 -5.394312
## 2 0.04407145 5.056428
## 3 0.04114818 3.302718
## 4 0.03760256 3.373787
## 5 0.03712042 3.420018
## 6 0.03057912 2.342252
# Create nontrivial_players
# Excludes the OBP = 0.550 point flagged above as most influential
nontrivial_players <- filter(mlbBat10, AB >= 10 & OBP < 0.5)
# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data=nontrivial_players)
# View model summary
summary(mod_cleaner)
##
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31383 -0.04165 -0.00261 0.03992 0.35819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.043326 0.009823 -4.411 1.18e-05 ***
## OBP 1.345816 0.033012 40.768 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6932
## F-statistic: 1662 on 1 and 734 DF, p-value: < 2.2e-16
# Visualize new model
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_abline(data = as.data.frame(t(coef(mod_cleaner))),
aes(intercept = `(Intercept)`, slope = OBP),
color = "dodgerblue")
# Visualize new model
# Equivalent line via geom_smooth, with a confidence band by default
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_smooth(method="lm")
# Rank high leverage points
# Secondary sort by ascending .cooksd: high leverage but LOW influence first
mod %>%
broom::augment() %>%
arrange(desc(.hat), .cooksd) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
Chapter 1 - What is statistical modeling?
Statistical models are summaries of data (can be encapsulations, machine learning, etc.):
R objects for statistical modeling - functions, formulae, and data frames:
Example code includes:
# Toy statistical model used by the course: scores start at 100, private
# schools pay a 5-point penalty that religious motivation offsets, and
# academic motivation compounds the score by 15% per unit.
test_scores <- function(school = "private", acad_motivation = 0, relig_motivation = 0) {
  # Penalty applies only when school == "private"; relig_motivation = 1 cancels it
  private_penalty <- 5 * (1 - relig_motivation) * (school == "private")
  # Multiplicative growth in the score from academic motivation
  growth_factor <- 1.15^acad_motivation
  (100 - private_penalty) * growth_factor
}
# Probe the model one input at a time, then compare a hand-built linear
# extrapolation against the model's true (nonlinear) output.
# Baseline run
test_scores(school = "public", acad_motivation = 0, relig_motivation = 0)
## [1] 100
# Change school input, leaving others at baseline
test_scores(school = "private", acad_motivation = 0, relig_motivation = 0)
## [1] 95
# Change acad_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 1, relig_motivation = 0)
## [1] 115
# Change relig_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 0, relig_motivation = 1)
## [1] 100
# Use results above to estimate output for new inputs
# Linear guess: baseline 100, -5 for private, +15 per unit acad_motivation
my_prediction <- 100 - 5 + (2 * 0) + (2 * 15)
my_prediction
## [1] 125
# Check prediction by using test_scores() directly
# The true model is multiplicative, so the linear estimate (125) undershoots
test_scores(school = "private", acad_motivation = 2, relig_motivation = 2)
## [1] 138.8625
# Exploring data frames from several packages, then formula-interface
# summaries and base plots on the AARP insurance data.
# Use data() to load Trucking_jobs
data(Trucking_jobs, package="statisticalModeling")
# View the number rows in Trucking_jobs
nrow(Trucking_jobs)
## [1] 129
# Use names() to find variable names in mosaicData::Riders
names(mosaicData::Riders)
## [1] "date" "day" "highT" "lowT" "hi" "lo" "precip"
## [8] "clouds" "riders" "ct" "weekday" "wday"
# Look at the head() of diamonds
head(ggplot2::diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# NOTE(review): mean_ alias is defined but never used below; the call on the
# next lines uses mosaic::mean directly -- confirm which is intended
mean_ <- mosaic::mean_
data(AARP, package="statisticalModeling")
# Find the variable names in AARP
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Find the mean cost broken down by sex
mosaic::mean(Cost ~ Sex, data = AARP)
## F M
## 47.29778 57.53056
# Create a boxplot using base, lattice, or ggplot2
boxplot(Cost ~ Sex, data=AARP)
# Make a scatterplot using base, lattice, or ggplot2
plot(Cost ~ Age, data=AARP)
Chapter 2 - Designing and Training Models
Modeling is a process rather than a result:
Evaluating models means assessing how well they match the real world (underlying data):
Example code includes:
# Build competing lm and rpart models of net running time, visualized with
# statisticalModeling::fmodel().
data(Runners, package="statisticalModeling")
# Find the variable names in Runners
names(Runners)
## [1] "age" "net" "gun" "sex"
## [5] "year" "previous" "nruns" "start_position"
# Build models: handicap_model_1, handicap_model_2, handicap_model_3
handicap_model_1 <- lm(net ~ age, data = Runners)
handicap_model_2 <- lm(net ~ sex, data = Runners)
handicap_model_3 <- lm(net ~ age + sex, data = Runners)
# For now, here's a way to visualize the models
statisticalModeling::fmodel(handicap_model_1)
statisticalModeling::fmodel(handicap_model_2)
statisticalModeling::fmodel(handicap_model_3)
# Build rpart model: model_2
# cp (complexity parameter) controls how aggressively the tree is pruned;
# smaller cp allows a larger tree
model_2 <- rpart::rpart(net ~ age + sex, data=Runners, cp=0.002)
# Examine graph of model_2 (don't change)
statisticalModeling::fmodel(model_2, ~ age + sex)
# DO NOT HAVE THIS DATASET!
# Create run_again_model
# run_again_model <- rpart(runs_again ~ age + sex + net, data=Ran_twice, cp=0.005)
# Visualize the model (don't change)
# fmodel(run_again_model, ~ age + net, data = Ran_twice)
# Insurance cost model on AARP: predict() vs. evaluate_model(), and the
# difference between data.frame() (row-wise pairing) and expand.grid()
# (full crossing) when constructing prediction inputs.
data(AARP, package="statisticalModeling")
# Display the variable names in the AARP data frame
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
# Construct a data frame: example_vals
example_vals <- data.frame(Age=60, Sex="F", Coverage=200)
# Predict insurance cost using predict()
predict(insurance_cost_model, newdata=example_vals)
## 1
## 363.637
# Calculate model output using evaluate_model()
# Same number as predict(), but returned alongside the inputs
statisticalModeling::evaluate_model(insurance_cost_model, data=example_vals)
## Age Sex Coverage model_output
## 1 60 F 200 363.637
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
# Create a data frame: new_inputs_1
# data.frame() pairs values element-wise: only 2 input rows
new_inputs_1 <- data.frame(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use expand.grid(): new_inputs_2
# expand.grid() crosses all values: 2 x 2 x 2 = 8 input rows
new_inputs_2 <- expand.grid(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use predict() for new_inputs_1 and new_inputs_2
predict(insurance_cost_model, newdata = new_inputs_1)
## 1 2
## -99.98726 292.88435
predict(insurance_cost_model, newdata = new_inputs_2)
## 1 2 3 4 5 6 7
## -99.98726 101.11503 -89.75448 111.34781 81.54928 282.65157 91.78206
## 8
## 292.88435
# Use evaluate_model() for new_inputs_1 and new_inputs_2
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_1)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 M 100 292.88435
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_2)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 F 0 101.11503
## 3 30 M 0 -89.75448
## 4 90 M 0 111.34781
## 5 30 F 100 81.54928
## 6 90 F 100 282.65157
## 7 30 M 100 91.78206
## 8 90 M 100 292.88435
# Evaluate insurance_cost_model
# With no data argument, evaluate_model() picks representative input levels
statisticalModeling::evaluate_model(insurance_cost_model)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
# Use fmodel() to reproduce the graphic
statisticalModeling::fmodel(insurance_cost_model, ~ Coverage + Age + Sex)
# A new formula to highlight difference in sexes
new_formula <- ~ Coverage + Sex + Age
# Make the new plot (don't change)
statisticalModeling::fmodel(insurance_cost_model, new_formula)
Chapter 3 - Assessing Prediction Performance
Choosing explanatory variables - depends on the intended purpose for the statistical model:
Cross validation - divide the data into two non-overlapping datasets, train and test:
Example code includes:
# Reconstruct the course's 100-row Runners_100 sample from hard-coded vectors
# (the original sampled dataset is not distributed with the notes).
runIDs <- c( 5035 , 10 , 9271 , 256 , 1175 , 17334 , 1571 , 5264 , 15985 , 2237 , 3178 , 7999 , 16462 , 15443 , 13318 , 10409 , 8741 , 5998 , 2860 , 8710 , 3695 , 12340 , 6598 , 6354 , 1125 , 8759 , 7238 , 294 , 2268 , 7219 , 9154 , 5940 , 7464 , 3669 , 14729 , 11636 , 5018 , 1877 , 4639 , 1049 , 4484 , 3896 , 8944 , 11838 , 5960 , 15648 , 11552 , 250 , 9584 , 15110 , 9106 , 10824 , 7706 , 5653 , 4018 , 8028 , 7468 , 14766 , 2945 , 10805 , 2439 , 13616 , 3151 , 10493 , 13595 , 3308 , 1038 , 9019 , 3477 , 11211 , 12410 , 7697 , 7709 , 3699 , 16979 , 9688 , 4891 , 6010 , 6582 , 3983 , 920 , 8972 , 9185 , 4265 , 14708 , 7575 , 3459 , 11727 , 14696 , 4075 , 6604 , 13815 , 260 , 8606 , 14643 , 4323 , 13826 , 3487 , 10602 , 4029 )
runAge <- c( 54 , 27 , 24 , 39 , 52 , 28 , 33 , 40 , 32 , 33 , 30 , 58 , 33 , 46 , 34 , 35 , 50 , 60 , 30 , 28 , 30 , 29 , 56 , 43 , 62 , 60 , 37 , 48 , 27 , 32 , 53 , 43 , 41 , 33 , 29 , 49 , 29 , 24 , 45 , 34 , 56 , 51 , 41 , 38 , 33 , 29 , 34 , 31 , 35 , 43 , 29 , 30 , 30 , 33 , 33 , 46 , 45 , 51 , 32 , 44 , 37 , 46 , 28 , 31 , 51 , 40 , 44 , 28 , 48 , 28 , 44 , 58 , 27 , 33 , 42 , 45 , 36 , 37 , 26 , 47 , 39 , 38 , 36 , 66 , 50 , 31 , 34 , 26 , 53 , 44 , 45 , 24 , 33 , 34 , 50 , 31 , 54 , 38 , 31 , 40 )
runNet <- c( 90 , 74.22 , 90.85 , 91.7 , 94.13 , 99.13 , 78.98 , 102.6 , 111.6 , 100.9 , 81.37 , 82.63 , 83.32 , 71.17 , 73.62 , 79.32 , 111.5 , 86.62 , 111.3 , 69.7 , 66.5 , 65.52 , 99.38 , 89.52 , 76.23 , 79.2 , 59.88 , 124.5 , 107.5 , 105.5 , 78.1 , 99.22 , 96.68 , 59.25 , 94.75 , 93.45 , 76.15 , 91.53 , 75.07 , 80.9 , 94.18 , 97.57 , 86.73 , 92.77 , 99.67 , 85.38 , 65.97 , 77.38 , 94.42 , 78.92 , 87.03 , 97.78 , 86.82 , 113.1 , 88.58 , 74.05 , 88.52 , 83.73 , 81.4 , 69 , 78.43 , 101.2 , 81.2 , 84.45 , 105.1 , 70.38 , 83.28 , 106.5 , 79.12 , 69.83 , 73.35 , 66.07 , 86.23 , 76.72 , 91.88 , 79.12 , 81.63 , 79.67 , 86.62 , 71.63 , 99.28 , 90.58 , 101.2 , 95.8 , 77.58 , 102.4 , 79.67 , 111.2 , 76.88 , 104.4 , 117.4 , 86.68 , 94.78 , 86.1 , 79.63 , 79.23 , 94.97 , 85.67 , 97.07 , 83.15 )
runGun <- c( 90.28 , 75.08 , 93.55 , 95.18 , 99.4 , 105.6 , 81.5 , 107.8 , 116.6 , 104.6 , 82.18 , 82.95 , 84.32 , 71.32 , 74.68 , 80.52 , 114.8 , 87.05 , 115.6 , 70.17 , 66.75 , 66.07 , 105.2 , 95.63 , 81.27 , 80.13 , 60.02 , 125.1 , 107.5 , 110 , 78.53 , 109.6 , 102.5 , 59.43 , 101.1 , 100.3 , 76.47 , 96.98 , 76.43 , 82.45 , 97.8 , 103.6 , 89.53 , 93.63 , 104.5 , 89.73 , 66.25 , 78.62 , 99.47 , 79.15 , 91.13 , 105.4 , 89.85 , 117.8 , 89.45 , 74.93 , 89.2 , 87.32 , 87.9 , 69.13 , 79.97 , 111 , 84.5 , 85.55 , 110.5 , 74.15 , 83.58 , 114.7 , 79.62 , 70.42 , 73.85 , 66.3 , 92.37 , 77.53 , 98.77 , 79.65 , 85.17 , 85.67 , 92.68 , 72.15 , 107.6 , 96.18 , 103.4 , 99.55 , 78.85 , 107 , 81.42 , 114.4 , 77.85 , 108.5 , 121.7 , 92.68 , 96.87 , 88.08 , 80.43 , 79.93 , 99.3 , 90.47 , 102.3 , 84.75 )
runSex <- c( 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'F' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' )
runYear <- c( 2004 , 2001 , 2000 , 2004 , 2005 , 2003 , 2002 , 2001 , 2004 , 2005 , 2005 , 2005 , 2002 , 2004 , 2003 , 2005 , 2005 , 2002 , 2006 , 2006 , 2005 , 2003 , 2004 , 2003 , 2003 , 2003 , 2003 , 2006 , 2004 , 2002 , 2005 , 2006 , 2004 , 2005 , 2004 , 2002 , 2002 , 2004 , 2004 , 2002 , 2001 , 2004 , 2001 , 2002 , 2003 , 2005 , 2004 , 2001 , 2005 , 2003 , 2004 , 2004 , 2003 , 2002 , 2005 , 2002 , 2000 , 2001 , 2005 , 2006 , 2004 , 2006 , 2000 , 2004 , 2002 , 2002 , 2004 , 2006 , 2004 , 2002 , 2005 , 2000 , 2005 , 2003 , 2004 , 2003 , 2005 , 2003 , 2005 , 2004 , 2005 , 2001 , 2000 , 2000 , 2001 , 2002 , 2005 , 2004 , 2006 , 2001 , 2005 , 2005 , 2003 , 2001 , 2005 , 2000 , 2002 , 2004 , 2004 , 2006 )
runPrevious <- c( 5 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 4 , 5 , 0 , 5 , 1 , 0 , 3 , 3 , 0 , 2 , 1 , 0 , 1 , 1 , 4 , 1 , 0 , 4 , 2 , 1 , 4 , 1 , 1 , 4 , 1 , 1 , 1 , 1 , 0 , 2 , 2 , 1 , 1 , 1 , 0 , 2 , 2 , 2 , 2 , 1 , 2 , 1 , 0 , 1 , 1 , 0 , 1 , 0 , 3 , 1 , 1 , 1 , 1 , 3 , 2 , 1 , 5 , 1 , 5 , 0 , 6 , 1 , 1 , 2 , 2 , 1 , 3 , 0 , 0 , 1 , 0 , 1 , 1 , 1 , 2 , 1 , 1 , 1 , 0 , 1 , 3 , 1 , 0 , 1 , 0 , 1 , 0 , 3 , 1 , 4 )
runNRuns <- c( 9 , 8 , 4 , 3 , 4 , 5 , 4 , 6 , 3 , 4 , 6 , 6 , 4 , 8 , 4 , 3 , 7 , 8 , 3 , 4 , 3 , 4 , 6 , 4 , 5 , 3 , 3 , 5 , 4 , 4 , 6 , 4 , 5 , 6 , 4 , 4 , 3 , 3 , 5 , 8 , 7 , 5 , 8 , 3 , 3 , 4 , 5 , 5 , 3 , 5 , 3 , 4 , 4 , 3 , 3 , 3 , 4 , 3 , 5 , 4 , 4 , 4 , 5 , 6 , 5 , 3 , 10 , 4 , 9 , 5 , 7 , 3 , 4 , 5 , 4 , 4 , 6 , 5 , 4 , 3 , 3 , 3 , 9 , 6 , 3 , 3 , 3 , 4 , 3 , 7 , 4 , 3 , 5 , 6 , 3 , 4 , 3 , 4 , 3 , 6 )
runStart_Position <- c( 'eager' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'calm' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'calm' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'calm' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' )
# Assemble the vectors into a data frame mirroring the Runners schema,
# coercing count-like columns to integer and keeping strings as characters
Runners_100 <- data.frame(age=as.integer(runAge),
net=runNet,
gun=runGun,
sex=runSex,
year=as.integer(runYear),
previous=as.integer(runPrevious),
nruns=as.integer(runNRuns),
start_position=runStart_Position,
orig.id=as.integer(runIDs),
stringsAsFactors=FALSE
)
str(Runners_100)
## 'data.frame': 100 obs. of 9 variables:
## $ age : int 54 27 24 39 52 28 33 40 32 33 ...
## $ net : num 90 74.2 90.8 91.7 94.1 ...
## $ gun : num 90.3 75.1 93.5 95.2 99.4 ...
## $ sex : chr "F" "M" "F" "F" ...
## $ year : int 2004 2001 2000 2004 2005 2003 2002 2001 2004 2005 ...
## $ previous : int 5 1 0 1 1 1 1 1 2 2 ...
## $ nruns : int 9 8 4 3 4 5 4 6 3 4 ...
## $ start_position: chr "eager" "eager" "calm" "mellow" ...
## $ orig.id : int 5035 10 9271 256 1175 17334 1571 5264 15985 2237 ...
# Compare base and augmented models on the TRAINING data: the augmented
# model's in-sample MSE is (necessarily) no worse than the base model's.
# Build a model of net running time
base_model <- lm(net ~ age + sex, data = Runners_100)
# Evaluate base_model on the training data
base_model_output <- predict(base_model, newdata = Runners_100)
# Build the augmented model
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Evaluate aug_model on the training data
aug_model_output <- predict(aug_model, newdata = Runners_100)
# How much do the model outputs differ?
mean((base_model_output - aug_model_output) ^ 2, na.rm = TRUE)
## [1] 0.5157921
# Build and evaluate the base model on Runners_100
base_model <- lm(net ~ age + sex, data = Runners_100)
base_model_output <- predict(base_model, newdata = Runners_100)
# Build and evaluate the augmented model on Runners_100
aug_model <- lm(net ~ age + sex + previous, data=Runners_100)
aug_model_output <- predict(aug_model, newdata = Runners_100)
# Find the case-by-case differences
base_model_differences <- with(Runners_100, net - base_model_output)
aug_model_differences <- with(Runners_100, net - aug_model_output)
# Calculate mean square errors
mean(base_model_differences ^ 2)
## [1] 131.5594
mean(aug_model_differences ^ 2)
## [1] 131.0436
# Even a pure-noise "bogus" explanatory variable reduces in-sample MSE --
# which is why training error alone cannot justify adding variables.
# NOTE(review): rnorm() is unseeded, so the exact MSE values will vary.
data(CPS85, package="mosaicData")
# Add bogus column to CPS85 (don't change)
CPS85$bogus <- rnorm(nrow(CPS85)) > 0
# Make the base model
base_model <- lm(wage ~ educ + sector + sex, data = CPS85)
# Make the bogus augmented model
aug_model <- lm(wage ~ educ + sector + sex + bogus, data = CPS85)
# Find the MSE of the base model
mean((CPS85$wage - predict(base_model, newdata = CPS85)) ^ 2)
## [1] 19.73308
# Find the MSE of the augmented model
# Slightly lower despite bogus being random noise
mean((CPS85$wage - predict(aug_model, newdata = CPS85)) ^ 2)
## [1] 19.5078
# Manual train/test split: fit on a random ~half of the rows, measure MSE
# on the held-out rows (unseeded, so the exact MSE varies per run).
# Generate a random TRUE or FALSE for each case in Runners_100
Runners_100$training_cases <- rnorm(nrow(Runners_100)) > 0
# Build base model net ~ age + sex with training cases
base_model <-
lm(net ~ age + sex, data = subset(Runners_100, training_cases))
# Evaluate the model for the testing cases
Preds <-
statisticalModeling::evaluate_model(base_model, data = subset(Runners_100, !training_cases))
# Calculate the MSE on the testing data
# Out-of-sample MSE (157) exceeds the in-sample MSE seen earlier (~131)
with(data = Preds, mean((net - model_output)^2))
## [1] 157.0097
# Cross-validated prediction error vs. in-sample error: the t-test shows the
# CV error is significantly larger than the in-sample MSE (131.56).
# The model
model <- lm(net ~ age + sex, data = Runners_100)
# Find the in-sample error (using the training data)
in_sample <- statisticalModeling::evaluate_model(model, data = Runners_100)
in_sample_error <-
with(in_sample, mean((net - model_output)^2, na.rm = TRUE))
# Calculate MSE for many different trials
trials <- statisticalModeling::cv_pred_error(model)
# View the cross-validated prediction errors
trials
## mse model
## 1 138.1343 model
## 2 143.1356 model
## 3 142.1734 model
## 4 142.8534 model
## 5 137.5054 model
# Find confidence interval on trials and compare to training_error
mosaic::t.test(~ mse, mu = in_sample_error, data = trials)
##
## One Sample t-test
##
## data: trials$mse
## t = 7.5746, df = 4, p-value = 0.001629
## alternative hypothesis: true mean is not equal to 131.5594
## 95 percent confidence interval:
## 137.3878 144.1330
## sample estimates:
## mean of x
## 140.7604
# Compare base vs. augmented models by cross-validated MSE; the two-sample
# t-test finds no significant difference (p = 0.15), so `previous` does not
# demonstrably improve out-of-sample prediction.
# The base model
base_model <- lm(net ~ age + sex, data = Runners_100)
# An augmented model adding previous as an explanatory variable
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Run cross validation trials on the two models
trials <- statisticalModeling::cv_pred_error(base_model, aug_model)
# Compare the two sets of cross-validated errors
t.test(mse ~ model, data = trials)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 1.6086, df = 7.0388, p-value = 0.1515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.228738 6.476066
## sample estimates:
## mean in group aug_model mean in group base_model
## 142.0658 139.4421
Chapter 4 - Exploring data with models
Prediction error for categorical variables:
Exploring data for relationships - example of the NHANES data from library(NHANES):
Example code includes:
# Error rates for a categorical response: a constant-only null model, a
# random-shuffle guess, and an rpart model on age + sex.
data(Runners, package="statisticalModeling")
# Build the null model with rpart()
# A constant column gives rpart nothing to split on, so it always predicts
# the most common class
Runners$all_the_same <- 1 # null "explanatory" variable
null_model <- rpart::rpart(start_position ~ all_the_same, data = Runners)
# Evaluate the null model on training data
null_model_output <- statisticalModeling::evaluate_model(null_model, data = Runners, type = "class")
# Calculate the error rate
with(data = null_model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5853618
# Generate a random guess...
null_model_output$random_guess <- mosaic::shuffle(Runners$start_position)
# ...and find the error rate
# Random guessing does worse than always predicting the modal class
with(data = null_model_output, mean(start_position != random_guess, na.rm = TRUE))
## [1] 0.6498309
# Train the model
model <- rpart::rpart(start_position ~ age + sex, data = Runners, cp = 0.001)
# Get model output with the training data as input
model_output <- statisticalModeling::evaluate_model(model, data = Runners, type = "class")
# Find the error rate
with(data = model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5567794
# Do not have this data (should be 93x11 for Training_data and 107x11 for Testing_data) - orig.id, all_the_same, training_case
# NOTE(review): the vectors below look like orig.id values, yet they are used
# as positional row indices into Runners[complete.cases(Runners), ]; several
# exceed plausible row counts, which would produce NA rows -- verify this
# reconstruction against the course's actual Training_data/Testing_data.
trainData <- c( 14340 , 1667 , 14863 , 15211 , 685 , 16629 , 16620 , 683 , 9695 , 4281 , 15395 , 17308 , 14847 , 2405 , 15696 , 6351 , 10266 , 14345 , 1145 , 9968 , 3409 , 3798 , 4209 , 2084 , 15561 , 7700 , 8620 , 17266 , 1638 , 13963 , 8621 , 14871 , 2945 , 14359 , 9723 , 10371 , 14271 , 826 , 4843 , 15191 , 14171 , 11845 , 15223 , 9213 , 4913 , 8194 , 15509 , 4562 , 15231 , 14317 , 2933 , 2866 , 15242 , 11343 , 15388 , 1104 , 13734 , 17186 , 5427 , 16100 , 5262 , 5873 , 5067 , 1073 , 3164 , 2164 , 1292 , 12337 , 13895 , 4379 , 11012 , 11872 , 10098 , 1130 , 1357 , 6150 , 493 , 7858 , 8761 , 18014 , 445 , 4207 , 15893 , 17022 , 703 , 17615 , 12517 , 181 , 9864 , 8611 , 4171 , 1732 , 11067 )
testData <- c( 16376 , 1316 , 15357 , 8699 , 13896 , 12064 , 13525 , 11807 , 13152 , 4473 , 12926 , 1134 , 7664 , 6597 , 17254 , 5991 , 17042 , 2701 , 2509 , 13264 , 10998 , 10482 , 7534 , 351 , 5866 , 18107 , 18046 , 15454 , 10602 , 10974 , 6988 , 7771 , 8223 , 14225 , 4409 , 2361 , 11462 , 4987 , 8440 , 2483 , 14984 , 14880 , 311 , 7505 , 4371 , 2434 , 15410 , 16068 , 16252 , 5942 , 8123 , 15375 , 15016 , 2379 , 7099 , 5664 , 11381 , 10688 , 1525 , 5506 , 4900 , 16574 , 14272 , 13912 , 3779 , 14584 , 15809 , 2908 , 16329 , 12042 , 1621 , 9248 , 5738 , 1345 , 6319 , 12575 , 3805 , 2895 , 15004 , 9918 , 11422 , 3592 , 10136 , 5941 , 12274 , 14178 , 4667 , 3393 , 11801 , 3814 , 8244 , 11721 , 14940 , 2572 , 14719 , 11398 , 13704 , 17989 , 12056 , 8215 , 8894 , 8303 , 7816 , 14698 , 17293 , 469 , 3533 )
Testing_data <- Runners[complete.cases(Runners), ][testData, ] %>%
mutate(orig.id=as.character(testData), all_the_same=1, training_case=FALSE)
Training_data <- Runners[complete.cases(Runners), ][trainData, ] %>%
mutate(orig.id=as.character(trainData), all_the_same=1, training_case=TRUE)
# Train the models
null_model <- rpart::rpart(start_position ~ all_the_same,
data = Training_data, cp = 0.001)
model_1 <- rpart::rpart(start_position ~ age,
data = Training_data, cp = 0.001)
model_2 <- rpart::rpart(start_position ~ age + sex,
data = Training_data, cp = 0.001)
# Find the out-of-sample error rate
null_output <- statisticalModeling::evaluate_model(null_model, data = Testing_data, type = "class")
model_1_output <- statisticalModeling::evaluate_model(model_1, data = Testing_data, type = "class")
model_2_output <- statisticalModeling::evaluate_model(model_2, data = Testing_data, type = "class")
# Calculate the error rates
null_rate <- with(data = null_output,
mean(start_position != model_output, na.rm = TRUE))
model_1_rate <- with(data = model_1_output,
mean(start_position != model_output, na.rm = TRUE))
model_2_rate <- with(data = model_2_output,
mean(start_position != model_output, na.rm = TRUE))
# Display the error rates
# Out of sample, the null model beats both explanatory models here
null_rate
## [1] 0.5233645
model_1_rate
## [1] 0.588785
model_2_rate
## [1] 0.5700935
model_2 <- rpart::rpart(net ~ age + sex, data = Runners, cp = 0.001)
rpart.plot::prp(model_2, type = 3)
data(Birth_weight, package="statisticalModeling")
model_1 <- rpart::rpart(baby_wt ~ smoke + income,
data = Birth_weight)
model_2 <- rpart::rpart(baby_wt ~ mother_age + mother_wt,
data = Birth_weight)
rpart.plot::prp(model_1, type = 3)
rpart.plot::prp(model_2, type = 3)
model_3 <- rpart::rpart(baby_wt ~ smoke + income + mother_age + mother_wt, data=Birth_weight)
rpart.plot::prp(model_3, type=3)
model_full <- rpart::rpart(baby_wt ~ ., data=Birth_weight)
rpart.plot::prp(model_full, type=3)
model_gest <- rpart::rpart(gestation ~ . -baby_wt, data=Birth_weight)
rpart.plot::prp(model_gest, type=3)
Chapter 5 - Covariates and Effect Size
Covariates and uses for models - making predictions with available data, exploring a large/complex dataset, and anticipating the outcome of an intervention:
Effect size - how much does the model output change for a given change in the input?
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Train the model price ~ fireplaces
simple_model <- lm(price ~ fireplaces, data = Houses_for_sale)
# Evaluate simple_model
statisticalModeling::evaluate_model(simple_model)
## fireplaces model_output
## 1 0 171823.9
## 2 1 238522.7
naive_worth <- 238522.7 - 171823.9
naive_worth
## [1] 66698.8
# Train another model including living_area
sophisticated_model <-lm(price ~ fireplaces + living_area, data = Houses_for_sale)
# Evaluate that model
statisticalModeling::evaluate_model(sophisticated_model)
## fireplaces living_area model_output
## 1 0 1000 124043.6
## 2 1 1000 133006.1
## 3 0 2000 233357.1
## 4 1 2000 242319.5
## 5 0 3000 342670.6
## 6 1 3000 351633.0
# Find price difference for fixed living_area
sophisticated_worth <- 242319.5 - 233357.1
sophisticated_worth
## [1] 8962.4
data(Crime, package="statisticalModeling")
# Train model_1 and model_2
model_1 <- lm(R ~ X, data = Crime)
model_2 <- lm(R ~ W, data = Crime)
# Evaluate each model...
statisticalModeling::evaluate_model(model_1)
## X model_output
## 1 100 106.82223
## 2 200 89.46721
## 3 300 72.11219
statisticalModeling::evaluate_model(model_2)
## W model_output
## 1 400 68.32909
## 2 600 103.70777
## 3 800 139.08644
change_with_X <- 89.46721 - 106.82223
change_with_X
## [1] -17.35502
change_with_W <- 103.70777 - 68.32909
change_with_W
## [1] 35.37868
# Train model_3 using both X and W as explanatory variables
model_3 <- lm(R ~ X + W, data = Crime)
# Evaluate model_3
statisticalModeling::evaluate_model(model_3)
## X W model_output
## 1 100 400 -62.60510
## 2 200 400 31.03422
## 3 300 400 124.67354
## 4 100 600 41.22502
## 5 200 600 134.86434
## 6 300 600 228.50366
## 7 100 800 145.05515
## 8 200 800 238.69447
## 9 300 800 332.33379
# Find the difference in output for each of X and W
change_with_X_holding_W_constant <- 134.86434 - 228.50366
change_with_X_holding_W_constant
## [1] -93.63932
change_with_W_holding_X_constant <- 134.86434 - 31.03422
change_with_W_holding_X_constant
## [1] 103.8301
data(Trucking_jobs, package="statisticalModeling")
# Train the five models
model_1 <- lm(earnings ~ sex, data = Trucking_jobs)
model_2 <- lm(earnings ~ sex + age, data = Trucking_jobs)
model_3 <- lm(earnings ~ sex + hiredyears, data = Trucking_jobs)
model_4 <- lm(earnings ~ sex + title, data = Trucking_jobs)
model_5 <- lm(earnings ~ sex + age + hiredyears + title, data = Trucking_jobs)
# Evaluate each model...
statisticalModeling::evaluate_model(model_1)
## sex model_output
## 1 M 40236.35
## 2 F 35501.25
statisticalModeling::evaluate_model(model_2, age = 40)
## sex age model_output
## 1 M 40 41077.03
## 2 F 40 38722.71
statisticalModeling::evaluate_model(model_3, hiredyears = 5)
## sex hiredyears model_output
## 1 M 5 39996.93
## 2 F 5 36366.89
statisticalModeling::evaluate_model(model_4, title = "REGL CARRIER REP")
## sex title model_output
## 1 M REGL CARRIER REP 27838.38
## 2 F REGL CARRIER REP 28170.71
statisticalModeling::evaluate_model(model_5, age = 40, hiredyears = 5,
title = "REGL CARRIER REP")
## sex age hiredyears title model_output
## 1 M 40 5 REGL CARRIER REP 30976.42
## 2 F 40 5 REGL CARRIER REP 30991.70
# ...and calculate the gender difference in earnings
diff_1 <- 40236.35 - 35501.25
diff_1
## [1] 4735.1
diff_2 <- 41077.03 - 38722.71
diff_2
## [1] 2354.32
diff_3 <- 39996.93 - 36366.89
diff_3
## [1] 3630.04
diff_4 <- 27838.38 - 28170.71
diff_4
## [1] -332.33
diff_5 <- 30976.42 - 30991.70
diff_5
## [1] -15.28
data(AARP, package="statisticalModeling")
modLin <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
statisticalModeling::evaluate_model(modLin)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
statisticalModeling::effect_size(modLin, ~ Age)
## slope Age to:Age Sex Coverage
## 1 3.351705 59.5 68.16025 F 20
statisticalModeling::effect_size(modLin, ~ Sex)
## change Sex to:Sex Age Coverage
## 1 10.23278 F M 59.5 20
statisticalModeling::effect_size(modLin, ~ Coverage)
## slope Coverage to:Coverage Age Sex
## 1 1.815365 20 37.23783 59.5 F
data(College_grades, package="statisticalModeling")
# Calculating the GPA
gpa_mod_1 <- lm(gradepoint ~ sid, data = College_grades)
# The GPA for two students
statisticalModeling::evaluate_model(gpa_mod_1, sid = c("S32115", "S32262"))
## sid model_output
## 1 S32115 3.448571
## 2 S32262 3.442500
# Use effect_size()
statisticalModeling::effect_size(gpa_mod_1, ~ sid)
## change sid to:sid
## 1 0.4886364 S32259 S32364
# Specify from and to levels to compare
statisticalModeling::effect_size(gpa_mod_1, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid
## 1 -0.006071429 S32115 S32262
# A better model?
gpa_mod_2 <- lm(gradepoint ~ sid + dept + level, data = College_grades)
# Find difference between the same two students as before
statisticalModeling::effect_size(gpa_mod_2, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid dept level
## 1 0.4216295 S32115 S32262 d 200
data(Houses_for_sale, package="statisticalModeling")
modAll <- lm(price ~ living_area + land_value + fireplaces, data=Houses_for_sale)
statisticalModeling::effect_size(modAll, ~ land_value)
## slope land_value to:land_value living_area fireplaces
## 1 0.9559322 25000 60021.17 1634.5 1
statisticalModeling::effect_size(modAll, ~ fireplaces)
## slope fireplaces to:fireplaces living_area land_value
## 1 8100.298 1 1.556102 1634.5 25000
statisticalModeling::effect_size(modAll, ~ living_area)
## slope living_area to:living_area land_value fireplaces
## 1 86.81317 1634.5 2254.436 25000 1
Chapter 1 - Effect Size and Interaction
Multiple explanatory variables - commonly use the mean/median for each continuous variable, and the most common level for each categorical variable:
Categorical response variables - output is a classification rather than a continuous value:
Interactions among explanatory variables:
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Build your model
my_model <- rpart::rpart(price ~ living_area + bathrooms + pct_college,
data = Houses_for_sale)
# Graph the model
statisticalModeling::fmodel(my_model, ~ living_area + bathrooms + pct_college)
data(NHANES, package="NHANES")
# Build the model
mod <- lm(Pulse ~ Height + BMI + Gender, data = NHANES)
# Confirm by reconstructing the graphic provided
statisticalModeling::fmodel(mod, ~ Height + BMI + Gender) +
ggplot2::ylab("Pulse")
# Find effect size
statisticalModeling::effect_size(mod, ~ BMI)
## slope BMI to:BMI Height Gender
## 1 0.06025728 25.98 33.35658 166 female
# Replot the model
statisticalModeling::fmodel(mod, ~ BMI + Height + Gender) +
ggplot2::ylab("Pulse")
model_1 <- rpart::rpart(start_position ~ age + sex + nruns,
data = Runners, cp = 0.001)
as_class <- statisticalModeling::evaluate_model(model_1, type = "class")
as_prob <- statisticalModeling::evaluate_model(model_1)
# Calculate effect size with respect to sex
statisticalModeling::effect_size(model_1, ~ sex)
## change.calm change.eager change.mellow sex to:sex age nruns
## 1 0.01281487 -0.2192357 0.2064208 M F 40 4
# Calculate effect size with respect to age
statisticalModeling::effect_size(model_1, ~ age)
## slope.calm slope.eager slope.mellow age to:age sex nruns
## 1 0.00497811 -0.01316334 0.008185229 40 50.84185 M 4
# Calculate effect size with respect to nruns
statisticalModeling::effect_size(model_1, ~ nruns)
## slope.calm slope.eager slope.mellow nruns to:nruns age sex
## 1 0.004900487 0.02725955 -0.03216004 4 5.734239 40 M
data(Whickham, package="mosaicData")
# An rpart model
mod1 <- rpart::rpart(outcome ~ age + smoker, data = Whickham)
# Logistic regression
mod2 <- glm(outcome == "Alive" ~ age + smoker,
data = Whickham, family = "binomial")
# Visualize the models with fmodel()
statisticalModeling::fmodel(mod1)
statisticalModeling::fmodel(mod2)
# Find the effect size of smoker
statisticalModeling::effect_size(mod1, ~ smoker)
## change.Alive change.Dead smoker to:smoker age
## 1 0 0 No Yes 46
statisticalModeling::effect_size(mod2, ~ smoker)
## change smoker to:smoker age
## 1 -0.02479699 No Yes 46
data(Birth_weight, package="statisticalModeling")
# Build the model without interaction
mod1 <- lm(baby_wt ~ gestation + smoke, data=Birth_weight)
# Build the model with interaction
mod2 <- lm(baby_wt ~ gestation * smoke, data=Birth_weight)
# Plot each model
statisticalModeling::fmodel(mod1) +
ggplot2::ylab("baby_wt")
statisticalModeling::fmodel(mod2) +
ggplot2::ylab("baby_wt")
data(Used_Fords, package="statisticalModeling")
# Train model_1
model_1 <- lm(Price ~ Age + Mileage,
data = Used_Fords)
# Train model_2
model_2 <- lm(Price ~ Age * Mileage,
data = Used_Fords)
# Plot both models
statisticalModeling::fmodel(model_1)
statisticalModeling::fmodel(model_2)
# Cross validate and compare prediction errors
res <- statisticalModeling::cv_pred_error(model_1, model_2)
t.test(mse ~ model, data = res)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 556.38, df = 6.3179, p-value = 4.66e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2424270 2445428
## sample estimates:
## mean in group model_1 mean in group model_2
## 6086599 3651749
Chapter 2 - Total and Partial Change
Interpreting effect size - magnitude is important, but only if interpreted properly (e.g., output units per unit change in the input):
R-squared is also known as the “coefficient of determination” and uses a capital R:
Degrees of freedom - Kaggle example based on restaurant data (137 x 40 with City, City.Group, Type, PS1-PS37 and a 137x1 vector Revenue):
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Train a model of house prices
price_model_1 <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of living area
statisticalModeling::effect_size(price_model_1, ~ living_area)
## slope living_area to:living_area land_value fireplaces bathrooms
## 1 76.06617 1634.5 2254.436 25000 1 2
## bedrooms
## 1 3
# Effect size of bathrooms
statisticalModeling::effect_size(price_model_1, ~ bathrooms, step=1)
## slope bathrooms to:bathrooms land_value living_area fireplaces
## 1 26156.43 2 3 25000 1634.5 1
## bedrooms
## 1 3
# Effect size of bedrooms
statisticalModeling::effect_size(price_model_1, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value living_area fireplaces
## 1 -8222.853 3 4 25000 1634.5 1
## bathrooms
## 1 2
# Let living_area change as it will
price_model_2 <- lm(price ~ land_value + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of bedroom in price_model_2
statisticalModeling::effect_size(price_model_2, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value fireplaces bathrooms
## 1 13882.42 3 4 25000 1 2
# Train a model of house prices
price_model <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Evaluate the model in scenario 1
statisticalModeling::evaluate_model(price_model, living_area = 2000, bedrooms = 2, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2000 0 1 2 181624.0
## 2 50000 2000 0 1 2 228787.1
## 3 0 2000 1 1 2 185499.2
## 4 50000 2000 1 1 2 232662.4
# Evaluate the model in scenario 2
statisticalModeling::evaluate_model(price_model, living_area = 2140, bedrooms = 3, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2140 0 1 3 184050.4
## 2 50000 2140 0 1 3 231213.5
## 3 0 2140 1 1 3 187925.7
## 4 50000 2140 1 1 3 235088.8
# Find the difference in output
price_diff <- 231213.5 - 228787.1
price_diff
## [1] 2426.4
# Evaluate the second scenario again, but add a half bath
statisticalModeling::evaluate_model(price_model, living_area = 2165, bedrooms = 3, bathrooms = 1.5)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2165 0 1.5 3 199030.3
## 2 50000 2165 0 1.5 3 246193.4
## 3 0 2165 1 1.5 3 202905.5
## 4 50000 2165 1 1.5 3 250068.7
# Calculate the price difference
new_price_diff <- 246193.4 - 228787.1
new_price_diff
## [1] 17406.3
# Fit model
car_price_model <- lm(Price ~ Age + Mileage, data = Used_Fords)
# Partial effect size
statisticalModeling::effect_size(car_price_model, ~ Age)
## slope Age to:Age Mileage
## 1 -573.5044 3 6.284152 48897.5
# To find total effect size
statisticalModeling::evaluate_model(car_price_model, Age = 6, Mileage = 42000)
## Age Mileage model_output
## 1 6 42000 9523.781
statisticalModeling::evaluate_model(car_price_model, Age = 7, Mileage = 50000)
## Age Mileage model_output
## 1 7 50000 8400.389
# Price difference between scenarios (round to nearest dollar)
price_difference <- 8400 - 9524
price_difference
## [1] -1124
# Effect for age without mileage in the model
car_price_model_2 <- lm(Price ~ Age, data = Used_Fords)
# Calculate partial effect size
statisticalModeling::effect_size(car_price_model_2, ~ Age)
## slope Age to:Age
## 1 -1124.556 3 6.284152
data(College_grades, package="statisticalModeling")
data(AARP, package="statisticalModeling")
data(Tadpoles, package="statisticalModeling")
College_grades <- College_grades[complete.cases(College_grades), ]
# Train some models
model_1 <- lm(gradepoint ~ sid, data = College_grades)
model_2 <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
model_3 <- lm(vmax ~ group + (rtemp + I(rtemp^2)), data = Tadpoles)
# Calculate model output on training data
output_1 <- statisticalModeling::evaluate_model(model_1, data = College_grades)
output_2 <- statisticalModeling::evaluate_model(model_2, data = AARP)
output_3 <- statisticalModeling::evaluate_model(model_3, data = Tadpoles)
# R-squared for the models
with(output_1, var(model_output) / var(gradepoint))
## [1] 0.3222716
with(output_2, var(model_output) / var(Cost))
## [1] 0.8062783
with(output_3, var(model_output) / var(vmax))
## [1] 0.4310651
data(HDD_Minneapolis, package="statisticalModeling")
# The two models
model_1 <- lm(hdd ~ year, data = HDD_Minneapolis)
model_2 <- lm(hdd ~ month, data = HDD_Minneapolis)
# Find the model output on the training data for each model
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
# Find R-squared for each of the 2 models
with(output_1, var(model_output) / var(hdd))
## [1] 0.0001121255
with(output_2, var(model_output) / var(hdd))
## [1] 0.9547171
# DO NOT HAVE THIS DATASET - Training is 267 x 12 (field 12 is "bogus", a 267x200 matrix of random numbers)
# Train model_1 without bogus
# model_1 <- lm(wage ~ sector, data = Training)
# Train model_2 with bogus
# model_2 <- lm(wage ~ sector + bogus, data = Training)
# Calculate R-squared using the training data
# output_1 <- statisticalModeling::evaluate_model(model_1, data = Training)
# output_2 <- statisticalModeling::evaluate_model(model_2, data = Training)
# with(output_1, var(model_output) / var(wage))
# with(output_2, var(model_output) / var(wage))
# Compare cross-validated MSE
# boxplot(mse ~ model, data = statisticalModeling::cv_pred_error(model_1, model_2))
data(CPS85, package="mosaicData")
# Train the four models
model_0 <- lm(wage ~ NULL, data = CPS85)
model_1 <- lm(wage ~ mosaic::rand(100), data = CPS85)
model_2 <- lm(wage ~ mosaic::rand(200), data = CPS85)
model_3 <- lm(wage ~ mosaic::rand(300), data = CPS85)
# Evaluate the models on the training data
output_0 <- statisticalModeling::evaluate_model(model_0, on_training = TRUE)
output_1 <- statisticalModeling::evaluate_model(model_1, on_training = TRUE)
output_2 <- statisticalModeling::evaluate_model(model_2, on_training = TRUE)
output_3 <- statisticalModeling::evaluate_model(model_3, on_training = TRUE)
# Compute R-squared for each model
with(output_0, var(model_output) / var(wage))
## [1] 0
with(output_1, var(model_output) / var(wage))
## [1] 0.1885643
with(output_2, var(model_output) / var(wage))
## [1] 0.3537312
with(output_3, var(model_output) / var(wage))
## [1] 0.5709465
# Compare the null model to model_3 using cross validation
cv_results <- statisticalModeling::cv_pred_error(model_0, model_3, ntrials = 3)
boxplot(mse ~ model, data = cv_results)
# Train this model with 24 degrees of freedom
model_1 <- lm(hdd ~ year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
with(output_1, var(model_output) / var(hdd))
## [1] 0.9554951
# Oops! Numerical year changed to categorical
HDD_Minneapolis$categorical_year <- as.character(HDD_Minneapolis$year)
# This model has many more degrees of freedom
model_2 <- lm(hdd ~ categorical_year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
## Warning in predict.lm(structure(list(coefficients =
## structure(c(580.000000000084, : prediction from a rank-deficient fit may be
## misleading
with(output_2, var(model_output) / var(hdd))
## [1] 1
Chapter 3 - Sampling Variability
Bootstrapping and precision - applying confidence intervals and similar tools to assess the precision of statistical models:
Scales and transformations - what do the numbers actually represent?
Example code includes:
data(CPS85, package="mosaicData")
# Two starting elements
model <- lm(wage ~ age + sector, data = CPS85)
statisticalModeling::effect_size(model, ~ age)
## slope age to:age sector
## 1 0.07362793 35 46.72657 prof
# For practice
my_test_resample <- sample(1:10, replace = TRUE)
my_test_resample
## [1] 6 3 4 6 7 8 3 10 8 8
# Construct a resampling of CPS85
trial_1_indices <- sample(1:nrow(CPS85), replace = TRUE)
trial_1_data <- CPS85[trial_1_indices, ]
# Train the model to that resampling
trial_1_model <- lm(wage ~ age + sector, data = trial_1_data)
# Calculate the quantity
statisticalModeling::effect_size(trial_1_model, ~ age)
## slope age to:age sector
## 1 -0.006353344 35 46.72057 service
# Model and effect size from the "real" data
model <- lm(wage ~ age + sector, data = CPS85)
statisticalModeling::effect_size(model, ~ age)
## slope age to:age sector
## 1 0.07362793 35 46.72657 prof
# Generate 10 resampling trials
my_trials <- statisticalModeling::ensemble(model, nreps = 10)
# Find the effect size for each trial
statisticalModeling::effect_size(my_trials, ~ age)
## slope age to:age sector bootstrap_rep
## 1 0.07264900 35 46.72657 prof 1
## 11 0.09594431 35 46.72657 prof 2
## 12 0.05815393 35 46.72657 prof 3
## 13 0.05747240 35 46.72657 prof 4
## 14 0.07871289 35 46.72657 prof 5
## 15 0.08003315 35 46.72657 prof 6
## 16 0.07585373 35 46.72657 prof 7
## 17 0.06480836 35 46.72657 prof 8
## 18 0.08341854 35 46.72657 prof 9
## 19 0.04212500 35 46.72657 prof 10
# Re-do with 100 trials
my_trials <- statisticalModeling::ensemble(model, nreps = 100)
trial_effect_sizes <- statisticalModeling::effect_size(my_trials, ~ age)
# Calculate the standard deviation of the 100 effect sizes
sd(trial_effect_sizes$slope)
## [1] 0.02008143
# An estimate of the value of a fireplace
model <- lm(price ~ land_value + fireplaces + living_area,
data = Houses_for_sale
)
statisticalModeling::effect_size(model, ~ fireplaces)
## slope fireplaces to:fireplaces land_value living_area
## 1 8100.298 1 1.556102 25000 1634.5
# Generate 100 resampling trials
trials <- statisticalModeling::ensemble(model, nreps = 100)
# Calculate the effect size in each of the trials
effect_sizes_in_trials <- statisticalModeling::effect_size(trials, ~ fireplaces)
# Show a histogram of the effect sizes
hist(effect_sizes_in_trials$slope)
# Calculate the standard error
sd(effect_sizes_in_trials$slope)
## [1] 3371.1
data(AARP, package="statisticalModeling")
# Make model with log(Cost)
mod_1 <- lm(log(Cost) ~ Age + Sex + Coverage, data = AARP)
mod_2 <- lm(log(Cost) ~ Age * Sex + Coverage, data = AARP)
mod_3 <- lm(log(Cost) ~ Age * Sex + log(Coverage), data = AARP)
mod_4 <- lm(log(Cost) ~ Age * Sex * log(Coverage), data = AARP)
# To display each model in turn
statisticalModeling::fmodel(mod_1, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_2, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_3, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_4, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
# Use cross validation to compare mod_4 and mod_1
results <- statisticalModeling::cv_pred_error(mod_1, mod_4)
boxplot(mse ~ model, data = results)
data(Oil_history, package="statisticalModeling")
str(Oil_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 63 obs. of 2 variables:
## $ year: int 1880 1890 1900 1905 1910 1915 1920 1925 1930 1935 ...
## $ mbbl: num 30 77 149 215 328 ...
Oil_production <- Oil_history %>%
filter(year <= 1968) %>%
mutate(log_mbbl=log(mbbl))
str(Oil_production)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19 obs. of 3 variables:
## $ year : int 1880 1890 1900 1905 1910 1915 1920 1925 1930 1935 ...
## $ mbbl : num 30 77 149 215 328 ...
## $ log_mbbl: num 3.4 4.34 5 5.37 5.79 ...
ggplot(Oil_production, aes(x=year, y=mbbl)) +
geom_point() +
geom_line()
# Model of oil production in mbbl
model_1 <- lm(mbbl ~ year, data = Oil_production)
# Plot model_1 with scatterplot of mbbl vs. year
statisticalModeling::fmodel(model_1, data = Oil_production) +
geom_point(data = Oil_production)
# Effect size of year
statisticalModeling::effect_size(model_1, ~ year)
## slope year to:year
## 1 140.3847 1935 1962.324
# Model of log-transformed production
model_2 <- lm(log_mbbl ~ year, data = Oil_production)
# Plot model_2 with scatterplot of mbbl vs. year
statisticalModeling::fmodel(model_2, data = Oil_production) +
geom_point(data = Oil_production)
# And the effect size on log-transformed production
statisticalModeling::effect_size(model_2, ~ year)
## slope year to:year
## 1 0.06636971 1935 1962.324
# Annual growth
100 * (exp(round(0.06637, 3)) - 1)
## [1] 6.822672
data(Used_Fords, package="statisticalModeling")
# A model of price
model_1 <- lm(Price ~ Mileage + Age, data = Used_Fords)
# A model of logarithmically transformed price
Used_Fords$log_price <- log(Used_Fords$Price)
model_2 <- lm(log_price ~ Mileage + Age, data = Used_Fords)
# The model values on the original cases
preds_1 <- statisticalModeling::evaluate_model(model_1, data = Used_Fords)
# The model output for model_2 - giving log price
preds_2 <- statisticalModeling::evaluate_model(model_2, data = Used_Fords)
# Transform predicted log price to price
preds_2$model_price <- exp(preds_2$model_output)
# Mean square errors in price
mean((preds_1$Price - preds_1$model_output)^2, na.rm = TRUE)
## [1] 6026231
mean((preds_2$Price - preds_2$model_price)^2, na.rm = TRUE)
## [1] 3711549
data(Used_Fords, package="statisticalModeling")
# A model of logarithmically transformed price
model <- lm(log(Price) ~ Mileage + Age, data = Used_Fords)
# Create the bootstrap replications
bootstrap_reps <- statisticalModeling::ensemble(model, nreps = 100, data = Used_Fords)
# Find the effect size
age_effect <- statisticalModeling::effect_size(bootstrap_reps, ~ Age)
# Change the slope to a percent change
age_effect$percent_change <- 100 * (exp(age_effect$slope) - 1)
# Find confidence interval
with(age_effect, mean(percent_change) + c(-2, 2) * sd(percent_change))
## [1] -9.211535 -7.382512
Chapter 4 - Variables Working Together
Confidence and collinearity - managing covariates appropriately to reflect real-world mechanisms:
Example code includes:
data(CPS85, package="mosaicData")
# A model of wage
model_1 <- lm(wage ~ educ + sector + exper + age, data = CPS85)
# Effect size of educ on wage
statisticalModeling::effect_size(model_1, ~ educ)
## slope educ to:educ sector exper age
## 1 0.5732615 12 14.61537 prof 15 35
# Examine confidence interval on effect size
ensemble_1 <- statisticalModeling::ensemble(model_1, nreps = 100)
effect_from_1 <- suppressWarnings(statisticalModeling::effect_size(ensemble_1, ~ educ))
with(effect_from_1, mean(slope) + c(-2, 2) * sd(slope))
## [1] 0.272480 1.012341
# Collinearity inflation factor on standard error
statisticalModeling::collinearity( ~ educ + sector + exper + age, data = CPS85)
## expl_vars SeIF
## 1 educ 15.273900
## 2 sectorconst 1.090245
## 3 sectormanag 1.215769
## 4 sectormanuf 1.252303
## 5 sectorother 1.239831
## 6 sectorprof 1.405901
## 7 sectorsales 1.137992
## 8 sectorservice 1.274175
## 9 exper 71.980564
## 10 age 68.116772
# Leave out covariates one at a time
statisticalModeling::collinearity( ~ educ + sector + exper, data = CPS85) # leave out age
## expl_vars SeIF
## 1 educ 1.380220
## 2 sectorconst 1.090245
## 3 sectormanag 1.215761
## 4 sectormanuf 1.252303
## 5 sectorother 1.239814
## 6 sectorprof 1.402902
## 7 sectorsales 1.137990
## 8 sectorservice 1.274174
## 9 exper 1.092803
statisticalModeling::collinearity( ~ educ + sector + age, data = CPS85) # leave out exper
## expl_vars SeIF
## 1 educ 1.311022
## 2 sectorconst 1.090245
## 3 sectormanag 1.215754
## 4 sectormanuf 1.252302
## 5 sectorother 1.239801
## 6 sectorprof 1.402764
## 7 sectorsales 1.137990
## 8 sectorservice 1.274174
## 9 age 1.034143
statisticalModeling::collinearity( ~ educ + exper + age, data = CPS85) # leave out sector
## expl_vars SeIF
## 1 educ 15.15169
## 2 exper 71.74900
## 3 age 67.90730
# Improved model leaving out worst offending covariate
model_2 <- lm(wage ~ educ + sector + age, data = CPS85)
# Confidence interval of effect size of educ on wage
ensemble_2 <- statisticalModeling::ensemble(model_2, nreps = 100)
effect_from_2 <- statisticalModeling::effect_size(ensemble_2, ~ educ)
with(effect_from_2, mean(slope) + c(-2, 2) * sd(slope))
## [1] 0.4353022 0.8946416
data(Used_Fords, package="statisticalModeling")
# Train a model Price ~ Age + Mileage
model_1 <- lm(Price ~ Age + Mileage, data = Used_Fords)
# Train a similar model including the interaction
model_2 <- lm(Price ~ Age * Mileage, data = Used_Fords)
# Compare cross-validated prediction error
statisticalModeling::cv_pred_error(model_1, model_2)
## mse model
## 1 6122725 model_1
## 2 6136052 model_1
## 3 6109931 model_1
## 4 6091905 model_1
## 5 6096249 model_1
## 6 3649625 model_2
## 7 3635689 model_2
## 8 3650510 model_2
## 9 3650551 model_2
## 10 3670668 model_2
# Use bootstrapping to find conf. interval on effect size of Age
ensemble_1 <- statisticalModeling::ensemble(model_1, nreps = 100)
ensemble_2 <- statisticalModeling::ensemble(model_2, nreps = 100)
effect_from_1 <- statisticalModeling::effect_size(ensemble_1, ~ Age)
effect_from_2 <- statisticalModeling::effect_size(ensemble_2, ~ Age)
with(effect_from_1, mean(slope) + c(-2, 2) * sd(slope))
## [1] -655.2477 -491.4356
with(effect_from_2, mean(slope) + c(-2, 2) * sd(slope))
## [1] -961.6264 -812.9487
# Compare inflation for the model with and without interaction
statisticalModeling::collinearity(~ Age + Mileage, data = Used_Fords)
## expl_vars SeIF
## 1 Age 1.5899
## 2 Mileage 1.5899
statisticalModeling::collinearity(~ Age * Mileage, data = Used_Fords)
## expl_vars SeIF
## 1 Age 2.510430
## 2 Mileage 2.147278
## 3 Age:Mileage 3.349224
Chapter 1 - Exploratory Time Series Data Analysis
A time series is a sequence of data points recorded in chronological order, especially common in finance and economics:
Sampling frequency - some time series data is evenly spaced, other time series data is only approximately evenly spaced:
Basic time series objects - start with a vector of numbers, then add a time index using ts() or related functions:
Example code includes:
data(Nile, package="datasets")
# Print the Nile dataset
print(Nile)
## Time Series:
## Start = 1871
## End = 1970
## Frequency = 1
## [1] 1120 1160 963 1210 1160 1160 813 1230 1370 1140 995 935 1110 994
## [15] 1020 960 1180 799 958 1140 1100 1210 1150 1250 1260 1220 1030 1100
## [29] 774 840 874 694 940 833 701 916 692 1020 1050 969 831 726
## [43] 456 824 702 1120 1100 832 764 821 768 845 864 862 698 845
## [57] 744 796 1040 759 781 865 845 944 984 897 822 1010 771 676
## [71] 649 846 812 742 801 1040 860 874 848 890 744 749 838 1050
## [85] 918 986 797 923 975 815 1020 906 901 1170 912 746 919 718
## [99] 714 740
# List the number of observations in the Nile dataset
length(Nile)
## [1] 100
# Display the first 10 elements of the Nile dataset
head(Nile, n=10)
## [1] 1120 1160 963 1210 1160 1160 813 1230 1370 1140
# Display the last 12 elements of the Nile dataset
tail(Nile, n=12)
## [1] 975 815 1020 906 901 1170 912 746 919 718 714 740
# Plot the Nile data
plot(Nile)
# Plot the Nile data with xlab and ylab arguments
plot(Nile, xlab = "Year", ylab = "River Volume (1e9 m^{3})")
# Plot the Nile data with xlab, ylab, main, and type arguments
plot(Nile, xlab = "Year", ylab = "River Volume (1e9 m^{3})",
main="Annual River Nile Volume at Aswan, 1871-1970", type="b"
)
continuous_series <- c( 0.5689 , 0.7663 , 0.9921 , 0.9748 , 0.3991 , 0.3766 , -0.3853 , -0.8364 , -0.9997 , -0.9983 , -0.6462 , -0.0939 , 0.4005 , 0.6816 , 0.9532 , 0.9969 , 0.8393 , 0.37 , -0.2551 , -0.6174 )
continuous_time_index <- c( 1.2103 , 1.7461 , 2.8896 , 3.5914 , 5.4621 , 5.5109 , 7.0743 , 8.2644 , 9.3734 , 9.5411 , 11.1611 , 12.3784 , 13.3906 , 14.0663 , 15.0935 , 15.8645 , 16.8574 , 18.0915 , 19.3655 , 20.1805 )
# Plot the continuous_series using continuous time indexing
par(mfrow=c(2,1))
plot(continuous_time_index, continuous_series, type = "b")
# Make a discrete time index using 1:20
discrete_time_index <- 1:20
# Now plot the continuous_series using discrete time indexing
plot(discrete_time_index, continuous_series, type = "b")
par(mfrow=c(1, 1))
# Load the monthly airline-passenger series from the datasets package:
# 144 observations spanning Jan 1949 - Dec 1960 (per str/start/end below)
data(AirPassengers, package="datasets")
str(AirPassengers)
## Time-Series [1:144] from 1949 to 1961: 112 118 132 129 121 135 148 148 136 119 ...
# Plot AirPassengers
plot(AirPassengers)
# View the start and end dates of AirPassengers
start(AirPassengers)
## [1] 1949 1
end(AirPassengers)
## [1] 1960 12
# Use time(), deltat(), frequency(), and cycle() with AirPassengers
# time() returns the fractional-year time index of every observation
time(AirPassengers)
## Jan Feb Mar Apr May Jun Jul
## 1949 1949.000 1949.083 1949.167 1949.250 1949.333 1949.417 1949.500
## 1950 1950.000 1950.083 1950.167 1950.250 1950.333 1950.417 1950.500
## 1951 1951.000 1951.083 1951.167 1951.250 1951.333 1951.417 1951.500
## 1952 1952.000 1952.083 1952.167 1952.250 1952.333 1952.417 1952.500
## 1953 1953.000 1953.083 1953.167 1953.250 1953.333 1953.417 1953.500
## 1954 1954.000 1954.083 1954.167 1954.250 1954.333 1954.417 1954.500
## 1955 1955.000 1955.083 1955.167 1955.250 1955.333 1955.417 1955.500
## 1956 1956.000 1956.083 1956.167 1956.250 1956.333 1956.417 1956.500
## 1957 1957.000 1957.083 1957.167 1957.250 1957.333 1957.417 1957.500
## 1958 1958.000 1958.083 1958.167 1958.250 1958.333 1958.417 1958.500
## 1959 1959.000 1959.083 1959.167 1959.250 1959.333 1959.417 1959.500
## 1960 1960.000 1960.083 1960.167 1960.250 1960.333 1960.417 1960.500
## Aug Sep Oct Nov Dec
## 1949 1949.583 1949.667 1949.750 1949.833 1949.917
## 1950 1950.583 1950.667 1950.750 1950.833 1950.917
## 1951 1951.583 1951.667 1951.750 1951.833 1951.917
## 1952 1952.583 1952.667 1952.750 1952.833 1952.917
## 1953 1953.583 1953.667 1953.750 1953.833 1953.917
## 1954 1954.583 1954.667 1954.750 1954.833 1954.917
## 1955 1955.583 1955.667 1955.750 1955.833 1955.917
## 1956 1956.583 1956.667 1956.750 1956.833 1956.917
## 1957 1957.583 1957.667 1957.750 1957.833 1957.917
## 1958 1958.583 1958.667 1958.750 1958.833 1958.917
## 1959 1959.583 1959.667 1959.750 1959.833 1959.917
## 1960 1960.583 1960.667 1960.750 1960.833 1960.917
# deltat() is the time increment between observations (1/12 year here)
deltat(AirPassengers)
## [1] 0.08333333
# frequency() is the number of observations per cycle (12 months per year)
frequency(AirPassengers)
## [1] 12
# cycle() gives each observation's position within the yearly cycle (1-12)
cycle(AirPassengers)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 1 2 3 4 5 6 7 8 9 10 11 12
## 1950 1 2 3 4 5 6 7 8 9 10 11 12
## 1951 1 2 3 4 5 6 7 8 9 10 11 12
## 1952 1 2 3 4 5 6 7 8 9 10 11 12
## 1953 1 2 3 4 5 6 7 8 9 10 11 12
## 1954 1 2 3 4 5 6 7 8 9 10 11 12
## 1955 1 2 3 4 5 6 7 8 9 10 11 12
## 1956 1 2 3 4 5 6 7 8 9 10 11 12
## 1957 1 2 3 4 5 6 7 8 9 10 11 12
## 1958 1 2 3 4 5 6 7 8 9 10 11 12
## 1959 1 2 3 4 5 6 7 8 9 10 11 12
## 1960 1 2 3 4 5 6 7 8 9 10 11 12
# Plot the AirPassengers data
plot(AirPassengers)
# Compute the mean of AirPassengers
# (na.rm = TRUE matters for the DataCamp version of the data, which had NAs;
# the copy loaded from datasets above is complete — TODO confirm)
mean(AirPassengers, na.rm=TRUE)
## [1] 280.2986
# Impute mean values to NA in AirPassengers
# NOTE(review): this assignment creates a modified copy of AirPassengers in
# the global environment; with the complete datasets version it overwrites
# the real observations at positions 85:96 rather than filling NAs.
AirPassengers[85:96] <- mean(AirPassengers, na.rm = TRUE)
# Generate another plot of AirPassengers
plot(AirPassengers)
# Add the complete AirPassengers data to your plot
# rm() deletes only the modified global copy, so the name resolves back to
# the untouched datasets::AirPassengers; points() then overlays that
# complete series (dotted red line) on the plot made just above.
rm(AirPassengers)
points(AirPassengers, type = "l", col = 2, lty = 3)
data_vector <- c( 2.0522 , 4.2929 , 3.3294 , 3.5086 , 0.001 , 1.9217 , 0.7978 , 0.3 , 0.9436 , 0.5748 , -0.0034 , 0.3449 , 2.223 , 0.1763 , 2.7098 , 1.2502 , -0.4007 , 0.8853 , -1.5852 , -2.2829 , -2.561 , -3.126 , -2.866 , -1.7847 , -1.8895 , -2.7255 , -2.1033 , -0.0174 , -0.3613 , -2.9008 , -3.2847 , -2.8685 , -1.9505 , -4.8802 , -3.2635 , -1.6396 , -3.3013 , -2.6331 , -1.7058 , -2.212 , -0.5171 , 0.0753 , -0.8407 , -1.4023 , -0.1382 , -1.4066 , -2.3047 , 1.5074 , 0.7119 , -1.1301 )
# Use print() and plot() to view data_vector
# (a plain numeric vector of 50 values — see the length/indices below)
print(data_vector)
## [1] 2.0522 4.2929 3.3294 3.5086 0.0010 1.9217 0.7978 0.3000
## [9] 0.9436 0.5748 -0.0034 0.3449 2.2230 0.1763 2.7098 1.2502
## [17] -0.4007 0.8853 -1.5852 -2.2829 -2.5610 -3.1260 -2.8660 -1.7847
## [25] -1.8895 -2.7255 -2.1033 -0.0174 -0.3613 -2.9008 -3.2847 -2.8685
## [33] -1.9505 -4.8802 -3.2635 -1.6396 -3.3013 -2.6331 -1.7058 -2.2120
## [41] -0.5171 0.0753 -0.8407 -1.4023 -0.1382 -1.4066 -2.3047 1.5074
## [49] 0.7119 -1.1301
plot(data_vector)
# Convert data_vector to a ts object with start = 2004 and frequency = 4
# frequency = 4 means quarterly data; 50 values run 2004 Q1 through 2016 Q2
time_series <- ts(data_vector, start=2004, frequency=4)
# Use print() and plot() to view time_series
# (the ts print method lays the values out by year and quarter)
print(time_series)
## Qtr1 Qtr2 Qtr3 Qtr4
## 2004 2.0522 4.2929 3.3294 3.5086
## 2005 0.0010 1.9217 0.7978 0.3000
## 2006 0.9436 0.5748 -0.0034 0.3449
## 2007 2.2230 0.1763 2.7098 1.2502
## 2008 -0.4007 0.8853 -1.5852 -2.2829
## 2009 -2.5610 -3.1260 -2.8660 -1.7847
## 2010 -1.8895 -2.7255 -2.1033 -0.0174
## 2011 -0.3613 -2.9008 -3.2847 -2.8685
## 2012 -1.9505 -4.8802 -3.2635 -1.6396
## 2013 -3.3013 -2.6331 -1.7058 -2.2120
## 2014 -0.5171 0.0753 -0.8407 -1.4023
## 2015 -0.1382 -1.4066 -2.3047 1.5074
## 2016 0.7119 -1.1301
plot(time_series)
# is.ts() tests whether an object carries the "ts" class attribute
# Check whether data_vector and time_series are ts objects
is.ts(data_vector)
## [1] FALSE
is.ts(time_series)
## [1] TRUE
# Check whether Nile is a ts object
is.ts(Nile)
## [1] TRUE
# Check whether AirPassengers is a ts object
is.ts(AirPassengers)
## [1] TRUE
# DO NOT HAVE eu_stocks - seems to be 1860x4 for 1991/130-1998/169, frequency 260, using DAX, SMI, CAC, FTSE
# Created a smaller mock-up for eu_stocks
numDAX <- c( 1628.8, 1613.6, 1606.5, 1621, 1618.2, 1610.6, 1630.8, 1640.2, 1635.5, 1645.9, 1647.8, 1638.3, 1629.9, 1621.5, 1624.7, 1627.6, 1632, 1621.2, 1613.4, 1605, 1605.8, 1616.7, 1619.3, 1620.5, 1619.7, 1623.1, 1614, 1631.9, 1630.4, 1633.5, 1626.5, 1650.4, 1650.1, 1654.1, 1653.6, 1501.8, 1524.3, 1603.7, 1622.5, 1636.7, 1652.1, 1645.8, 1650.4, 1651.5, 1649.9, 1653.5, 1657.5, 1649.5, 1649.1, 1646.4, 1638.7, 1625.8, 1628.6, 1632.2, 1633.7, 1631.2, 1635.8, 1621.3, 1624.7, 1616.1, 1618.1, 1627.8, 1625.8, 1614.8, 1612.8, 1605.5, 1609.3, 1607.5, 1607.5, 1604.9, 1589.1, 1582.3, 1568, 1568.2, 1569.7, 1571.7, 1585.4, 1570, 1561.9, 1565.2, 1570.3, 1577, 1590.3, 1572.7, 1572.1, 1579.2, 1588.7, 1586, 1579.8, 1572.6, 1568.1, 1578.2, 1573.9, 1582.1, 1610.2, 1605.2, 1623.8, 1615.3, 1627.1, 1627, 1605.7, 1589.7, 1589.7, 1603.3, 1599.8, 1590.9, 1603.5, 1589.9, 1587.9, 1571.1, 1549.8, 1549.4, 1554.7, 1557.5, 1555.3, 1559.8, 1548.4, 1544, 1550.2, 1557, 1551.8, 1562.9, 1570.3, 1559.3, 1545.9, 1542.8, 1542.8, 1542.8, 1542.8, 1564.3, 1577.3, 1577.3, 1577.3, 1598.2, 1604, 1604.7, 1593.7, 1581.7, 1599.1, 1613.8, 1620.5, 1629.5, 1663.7, 1664.1, 1669.3, 1685.1, 1687.1, 1680.1, 1671.8, 1669.5, 1686.7, 1685.5, 1671, 1683.1, 1685.7, 1685.7, 1678.8, 1685.8, 1683.7, 1686.6, 1683.7, 1679.1, 1685, 1680.8, 1676.2, 1688.5, 1696.5, 1690.2, 1711.3, 1711.3, 1729.9, 1716.6, 1743.4, 1745.2, 1746.8, 1749.3, 1763.9, 1762.3, 1762.3, 1746.8, 1753.5, 1753.2, 1739.9, 1723.9, 1734.4, 1723.1, 1732.9, 1729.9, 1725.7, 1730.9, 1714.2, 1716.2, 1719.1, 1718.2, 1698.8, 1714.8, 1718.3, 1706.7, 1723.4, 1716.2, 1738.8, 1737.4, 1714.8, 1724.2, 1733.8, 1730, 1734.5, 1744.3, 1746.9, 1746.9, 1746.9, 1747.5, 1753.1, 1745.2, 1745.7, 1742.9, 1731.7, 1731.2, 1728.1, 1728.1, 1731.3, 1733.8, 1745.8, 1752.6, 1748.1, 1750.7, 1747.9, 1745.8, 1735.3, 1719.9, 1763.6, 1766.8, 1785.4, 1783.6, 1804.4, 1812.3, 1799.5, 1792.8, 1792.8, 1806.4, 1798.2, 1800.6, 1786.2, 1791.3, 1789, 1789, 1784.7, 1789.5, 1779.7, 1787, 1773.2, 1781.6, 
1773.8, 1773.8, 1776.3, 1770.7, 1772.4, 1762.5, 1764.3, 1752.8, 1756, 1755, 1759.9, 1759.8, 1776.5, 1770, 1767, 1752.3, 1760.2, 1750.3, 1731.4, 1735.5, 1733.8, 1730.8, 1699.5, 1652.7, 1654.1, 1636.8, 1622.8, 1613.4, 1617.8, 1617.2, 1637.6, 1622.2, 1608.5, 1605.1, 1609.6, 1624.9, 1618.1, 1612, 1579, 1561.4, 1547.9, 1548.6, 1560.2, 1554.8, 1531.9, 1526.1, 1509, 1530, 1485, 1464, 1475.1, 1516.1, 1519.7, 1530, 1516.4, 1515.5, 1543.9, 1534.7, 1538.7, 1536.7, 1523.8, 1527.1, 1530.2, 1601.5, 1580.3, 1595.1, 1579.5, 1600.6, 1566, 1557, 1542.7, 1536.3, 1510.7, 1481, 1483.8, 1470.1, 1484.8, 1475.4, 1402.3, 1421.5, 1434.6, 1446.3, 1437.7, 1441.6, 1471.6, 1454, 1453.8, 1458, 1479.6, 1504.9, 1496.5, 1511, 1528.9, 1534, 1536.6, 1508.2, 1493.5, 1489.7, 1482.4, 1483.3, 1470.6, 1484.8, 1487.7, 1508.6, 1515.3, 1509.8, 1542.3, 1541.8, 1542.5, 1550.3, 1550.3, 1543.4, 1547.8, 1523.6, 1526.7, 1513.4, 1523, 1529.7, 1545.1, 1546.8, 1528.1, 1530.7, 1526.2, 1519.5, 1506.7, 1504.3, 1480.7, 1476.7, 1478.1, 1479.6, 1477.5, 1472.6, 1495.6, 1517.5, 1520.9, 1527.1, 1527.1, 1527.1, 1547.5, 1545.8, 1538.4, 1538.4, 1538.4, 1538, 1554, 1551.2, 1538.4, 1529.1 )
numSMI <- c( 1678.1, 1688.5, 1678.6, 1684.1, 1686.6, 1671.6, 1682.9, 1703.6, 1697.5, 1716.3, 1723.8, 1730.5, 1727.4, 1733.3, 1734, 1728.3, 1737.1, 1723.1, 1723.6, 1719, 1721.2, 1725.3, 1727.2, 1727.2, 1731.6, 1724.1, 1716.9, 1723.4, 1723, 1728.4, 1722.1, 1724.5, 1733.6, 1739, 1726.2, 1587.4, 1630.6, 1685.5, 1701.3, 1718, 1726.2, 1716.6, 1725.8, 1737.4, 1736.6, 1732.4, 1731.2, 1726.9, 1727.8, 1720.2, 1715.4, 1708.7, 1713, 1713.5, 1718, 1701.7, 1701.7, 1684.9, 1687.2, 1690.6, 1684.3, 1679.9, 1672.9, 1663.1, 1669.3, 1664.7, 1672.3, 1687.7, 1686.8, 1686.6, 1675.8, 1677.4, 1673.2, 1665, 1671.3, 1672.4, 1676.2, 1692.6, 1696.5, 1716.1, 1713.3, 1705.1, 1711.3, 1709.8, 1688.6, 1698.9, 1700, 1693, 1683.9, 1679.2, 1673.9, 1683.9, 1688.4, 1693.9, 1720.9, 1717.9, 1733.6, 1729.7, 1735.6, 1734.1, 1699.3, 1678.6, 1675.5, 1670.1, 1652.2, 1635, 1654.9, 1642, 1638.7, 1622.6, 1596.1, 1612.4, 1625, 1610.5, 1606.6, 1610.7, 1603.1, 1591.5, 1605.2, 1621.4, 1622.5, 1626.6, 1627.4, 1614.9, 1602.3, 1598.3, 1627, 1627, 1627, 1655.7, 1670.1, 1670.1, 1670.1, 1670.1, 1704, 1711.8, 1700.5, 1690.3, 1715.4, 1723.5, 1719.4, 1734.4, 1772.8, 1760.3, 1747.2, 1750.2, 1755.3, 1754.6, 1751.2, 1752.5, 1769.4, 1767.6, 1750, 1747.1, 1753.5, 1752.8, 1752.9, 1764.7, 1776.8, 1779.3, 1785.1, 1798.2, 1794.1, 1795.2, 1780.4, 1789.5, 1794.2, 1784.4, 1800.1, 1804, 1816.2, 1810.5, 1821.9, 1828.2, 1840.6, 1841.1, 1846.3, 1850, 1839, 1820.2, 1815.2, 1820.6, 1807.1, 1791.4, 1806.2, 1798.7, 1818.2, 1820.5, 1833.3, 1837.1, 1818.2, 1824.1, 1830.1, 1835.6, 1828.7, 1839.2, 1837.2, 1826.7, 1838, 1829.1, 1843.1, 1850.5, 1827.1, 1829.1, 1848, 1840.5, 1853.8, 1874.1, 1871.3, 1871.3, 1871.3, 1860.5, 1874.7, 1880.1, 1874.7, 1875.6, 1859.5, 1874.2, 1880.1, 1880.1, 1907.7, 1920.5, 1937.3, 1936.8, 1949.1, 1963.7, 1950.8, 1953.5, 1945, 1921.1, 1939.1, 1928, 1933.4, 1925.7, 1931.7, 1928.7, 1924.5, 1914.2, 1914.2, 1920.6, 1923.3, 1930.4, 1915.2, 1916.9, 1913.8, 1913.8, 1899.7, 1888, 1868.8, 1879.9, 1865.7, 1881.3, 1873.1, 1862.5, 
1869.3, 1846.9, 1847.1, 1838.3, 1845.8, 1835.5, 1846.6, 1854.8, 1845.3, 1854.5, 1870.5, 1862.6, 1856.6, 1837.6, 1846.7, 1856.5, 1841.8, 1835, 1844.4, 1838.9, 1805.6, 1756.6, 1786.1, 1757.1, 1762.8, 1756.8, 1761.9, 1778.5, 1812.7, 1806.1, 1798.1, 1794.9, 1805.4, 1820.3, 1819.6, 1809.6, 1799.9, 1800.3, 1793.3, 1784.8, 1791.7, 1800.2, 1788.6, 1775.7, 1753.5, 1768.2, 1727.9, 1709.6, 1704.6, 1740.6, 1745.7, 1751.7, 1747.3, 1757.8, 1774.2, 1774.4, 1788.3, 1788, 1779.1, 1792.8, 1812, 1872.1, 1851.4, 1873.4, 1889.6, 1897.5, 1888.8, 1900.4, 1913.4, 1909.9, 1910.8, 1879.2, 1880.2, 1878.3, 1885.2, 1867.6, 1788, 1820.5, 1858.2, 1870.3, 1878.4, 1881.5, 1893.2, 1889.3, 1877.3, 1884, 1904.7, 1922.7, 1908.5, 1911.4, 1921.1, 1930.8, 1927.8, 1908.3, 1905.9, 1911.1, 1921.6, 1933.6, 1942, 1951.5, 1955.7, 1957.4, 1962.3, 1946.1, 1950.2, 1929.7, 1913.4, 1889.5, 1882.8, 1895.4, 1897.9, 1891.5, 1880.1, 1887, 1891.4, 1914.6, 1931.2, 1929.2, 1924.3, 1927, 1935, 1955.4, 1962.2, 1980.7, 1987.7, 1993.7, 2015.7, 2005, 2023.9, 2028.5, 2044.9, 2045.8, 2057.3, 2061.7, 2061.7, 2061.7, 2092.3, 2090.1, 2105.4, 2105.4, 2105.4, 2117.7, 2128.2, 2124.7, 2079.9, 2074.9 )
numCAC <- c( 1772.8, 1750.5, 1718, 1708.1, 1723.1, 1714.3, 1734.5, 1757.4, 1754, 1754.3, 1759.8, 1755.5, 1758.1, 1757.5, 1763.5, 1762.8, 1768.9, 1778.1, 1780.1, 1767.7, 1757.9, 1756.6, 1754.7, 1766.8, 1766.5, 1762.2, 1759.5, 1782.4, 1789.5, 1783.5, 1780.4, 1808.8, 1820.3, 1820.3, 1820.3, 1687.5, 1725.6, 1792.9, 1819.1, 1833.5, 1853.4, 1849.7, 1851.8, 1857.7, 1864.3, 1863.5, 1873.2, 1860.8, 1868.7, 1860.4, 1855.9, 1840.5, 1842.6, 1861.2, 1876.2, 1878.3, 1878.4, 1869.4, 1880.4, 1885.5, 1888.4, 1885.2, 1877.9, 1876.5, 1883.8, 1880.6, 1887.4, 1878.3, 1867.1, 1851.9, 1843.6, 1848.1, 1843.4, 1843.6, 1833.8, 1833.4, 1856.9, 1863.4, 1855.5, 1864.2, 1846, 1836.8, 1830.4, 1831.6, 1834.8, 1852.1, 1849.8, 1861.8, 1856.7, 1856.7, 1841.5, 1846.9, 1836.1, 1838.6, 1857.6, 1857.6, 1858.4, 1846.8, 1868.5, 1863.2, 1808.3, 1765.1, 1763.5, 1766, 1741.3, 1743.3, 1769, 1757.9, 1754.9, 1739.7, 1708.8, 1722.2, 1713.9, 1703.2, 1685.7, 1663.4, 1636.9, 1645.6, 1671.6, 1688.3, 1696.8, 1711.7, 1706.2, 1684.2, 1648.5, 1633.6, 1699.1, 1699.1, 1722.5, 1720.7, 1741.9, 1765.7, 1765.7, 1749.9, 1770.3, 1787.6, 1778.7, 1785.6, 1833.9, 1837.4, 1824.3, 1843.8, 1873.6, 1860.2, 1860.2, 1865.9, 1867.9, 1841.3, 1838.7, 1849.9, 1869.3, 1890.6, 1879.6, 1873.9, 1875.3, 1857, 1856.5, 1865.8, 1860.6, 1861.6, 1865.6, 1864.1, 1861.6, 1876.5, 1865.1, 1882.1, 1912.2, 1915.4, 1951.2, 1962.4, 1976.5, 1953.5, 1981.3, 1985.1, 1983.4, 1979.7, 1983.8, 1988.1, 1973, 1966.9, 1976.3, 1993.9, 1968, 1941.8, 1947.1, 1929.2, 1943.6, 1928.2, 1922, 1919.1, 1884.6, 1896.3, 1928.3, 1934.8, 1923.5, 1943.8, 1942.4, 1928.1, 1942, 1942.7, 1974.8, 1975.4, 1907.5, 1943.6, 1974.1, 1963.3, 1972.3, 1990.7, 1978.2, 1978.2, 1978.2, 1980.4, 1983.7, 1978.1, 1984.9, 1995.7, 2006.6, 2036.7, 2031.1, 2031.1, 2041.6, 2046.9, 2047.2, 2063.4, 2063.4, 2077.5, 2063.6, 2053.2, 2017, 2024, 2051.6, 2023.1, 2030.8, 2016.8, 2045.1, 2046.3, 2029.6, 2014.1, 2014.1, 2033.3, 2017.4, 2024.9, 1992.6, 1994.9, 1981.6, 1981.6, 1962.2, 1953.7, 1928.8, 1928.3, 1918.1, 
1931.4, 1908.8, 1891.8, 1913.9, 1885.8, 1895.8, 1899.6, 1920.3, 1915.3, 1907.3, 1900.6, 1880.9, 1873.5, 1883.6, 1868.5, 1879.1, 1847.8, 1861.8, 1859.4, 1859.4, 1859.4, 1853.3, 1851.2, 1801.8, 1767.9, 1762.7, 1727.5, 1734.6, 1734.6, 1755.4, 1769, 1801.6, 1782.6, 1754.7, 1784.4, 1787.6, 1798, 1793.8, 1777.3, 1755.2, 1737.8, 1730.1, 1722.4, 1753.5, 1757.3, 1736.7, 1734.2, 1724.2, 1744.2, 1689.7, 1667.7, 1667.8, 1687.6, 1687.5, 1684.9, 1674.2, 1711.4, 1780.5, 1779, 1779.3, 1763.7, 1756.8, 1774.2, 1802, 1873.6, 1836.2, 1859.8, 1852.7, 1882.9, 1826.1, 1832.8, 1828.9, 1829.5, 1843.5, 1770.3, 1731.9, 1736.7, 1724, 1683.3, 1611, 1612.5, 1654.2, 1673.9, 1657.3, 1655.1, 1685.1, 1667.9, 1650, 1664.2, 1679.1, 1731.3, 1722.2, 1730.7, 1766.4, 1770.7, 1774.5, 1749.9, 1730.9, 1742.4, 1742.4, 1786.9, 1804.1, 1804.7, 1793.6, 1786.7, 1798.5, 1798.5, 1821.5, 1796.8, 1772.7, 1764.4, 1759.2, 1722.3, 1724.2, 1674.8, 1720.6, 1721, 1739.7, 1749.7, 1771.4, 1792.3, 1783.3, 1799.4, 1781.7, 1788.6, 1765.9, 1791.2, 1769.5, 1758.7, 1738.3, 1744.8, 1736.7, 1735.2, 1760.1, 1786.3, 1824.4, 1821.1, 1854.6, 1854.6, 1857.5, 1870.3, 1858.8, 1857.8, 1857.8, 1843.1, 1850.8, 1859.6, 1844.5, 1852.6 )
numFTSE <- c( 2443.6, 2460.2, 2448.2, 2470.4, 2484.7, 2466.8, 2487.9, 2508.4, 2510.5, 2497.4, 2532.5, 2556.8, 2561, 2547.3, 2541.5, 2558.5, 2587.9, 2580.5, 2579.6, 2589.3, 2595, 2595.6, 2588.8, 2591.7, 2601.7, 2585.4, 2573.3, 2597.4, 2600.6, 2570.6, 2569.4, 2584.9, 2608.8, 2617.2, 2621, 2540.5, 2554.5, 2601.9, 2623, 2640.7, 2640.7, 2619.8, 2624.2, 2638.2, 2645.7, 2679.6, 2669, 2664.6, 2663.3, 2667.4, 2653.2, 2630.8, 2626.6, 2641.9, 2625.8, 2606, 2594.4, 2583.6, 2588.7, 2600.3, 2579.5, 2576.6, 2597.8, 2595.6, 2599, 2621.7, 2645.6, 2644.2, 2625.6, 2624.6, 2596.2, 2599.5, 2584.1, 2570.8, 2555, 2574.5, 2576.7, 2579, 2588.7, 2601.1, 2575.7, 2559.5, 2561.1, 2528.3, 2514.7, 2558.5, 2553.3, 2577.1, 2566, 2549.5, 2527.8, 2540.9, 2534.2, 2538, 2559, 2554.9, 2575.5, 2546.5, 2561.6, 2546.6, 2502.9, 2463.1, 2472.6, 2463.5, 2446.3, 2456.2, 2471.5, 2447.5, 2428.6, 2420.2, 2414.9, 2420.2, 2423.8, 2407, 2388.7, 2409.6, 2392, 2380.2, 2423.3, 2451.6, 2440.8, 2432.9, 2413.6, 2391.6, 2358.1, 2345.4, 2384.4, 2384.4, 2384.4, 2418.7, 2420, 2493.1, 2493.1, 2492.8, 2504.1, 2493.2, 2482.9, 2467.1, 2497.9, 2477.9, 2490.1, 2516.3, 2537.1, 2541.6, 2536.7, 2544.9, 2543.4, 2522, 2525.3, 2510.4, 2539.9, 2552, 2546.5, 2550.8, 2571.2, 2560.2, 2556.8, 2547.1, 2534.3, 2517.2, 2538.4, 2537.1, 2523.7, 2522.6, 2513.9, 2541, 2555.9, 2536.7, 2543.4, 2542.3, 2559.7, 2546.8, 2565, 2562, 2562.1, 2554.3, 2565.4, 2558.4, 2538.3, 2533.1, 2550.7, 2574.8, 2522.4, 2493.3, 2476, 2470.7, 2491.2, 2464.7, 2467.6, 2456.6, 2441, 2458.7, 2464.9, 2472.2, 2447.9, 2452.9, 2440.1, 2408.6, 2405.4, 2382.7, 2400.9, 2404.2, 2393.2, 2436.4, 2572.6, 2591, 2600.5, 2640.2, 2638.6, 2638.6, 2638.6, 2625.8, 2607.8, 2609.8, 2643, 2658.2, 2651, 2664.9, 2654.1, 2659.8, 2659.8, 2662.2, 2698.7, 2701.9, 2725.7, 2737.8, 2722.4, 2720.5, 2694.7, 2682.6, 2703.6, 2700.6, 2711.9, 2702, 2715, 2715, 2704.6, 2698.6, 2694.2, 2707.6, 2697.6, 2705.9, 2680.9, 2681.9, 2668.5, 2645.8, 2635.4, 2636.1, 2614.1, 2603.7, 2593.6, 2616.3, 2598.4, 2562.7, 2584.8, 
2550.3, 2560.6, 2532.6, 2557.3, 2534.1, 2515.8, 2521.2, 2493.9, 2476.1, 2497.1, 2469, 2493.7, 2472.6, 2497.9, 2490.8, 2478.3, 2484, 2486.4, 2483.4, 2431.9, 2403.7, 2415.6, 2387.9, 2399.5, 2377.2, 2348, 2373.4, 2423.2, 2411.6, 2399.6, 2420.2, 2407.5, 2392.8, 2377.6, 2350.1, 2325.7, 2309.6, 2303.1, 2318, 2356.8, 2376.1, 2354.7, 2363.5, 2359.4, 2365.7, 2311.1, 2281, 2285, 2311.6, 2312.6, 2312.6, 2298.4, 2313, 2381.9, 2362.2, 2372.2, 2337.7, 2327.5, 2340.6, 2370.9, 2422.1, 2370, 2378.3, 2483.9, 2567, 2560.1, 2586, 2580.5, 2621.2, 2601, 2560, 2565.5, 2553, 2572.3, 2549.7, 2446.3, 2488.4, 2517.1, 2538.8, 2541.2, 2557.2, 2584.7, 2574.7, 2546.6, 2563.9, 2562.2, 2617, 2645.7, 2658.1, 2669.7, 2661.6, 2669.8, 2650.4, 2642.3, 2658.3, 2687.8, 2705.6, 2691.7, 2711.1, 2702.7, 2695.4, 2714.6, 2696.8, 2726.4, 2697.5, 2679.6, 2679.2, 2704, 2706.2, 2732.4, 2722.9, 2727.1, 2709.6, 2741.8, 2760.1, 2778.8, 2792, 2764.1, 2771, 2759.4, 2754.5, 2769.8, 2750.7, 2726.5, 2716.2, 2721.8, 2717.9, 2732.8, 2740.3, 2789.7, 2807.7, 2842, 2827.4, 2827.5, 2827.5, 2827.5, 2847.8, 2832.5, 2846.5, 2846.5, 2861.5, 2833.6, 2826, 2816.5, 2799.2 )
# Column-bind the four index vectors into a matrix (byrow=FALSE: each
# vector fills one column). This mock has 400 rows (see str below); the
# real EuStockMarkets data is 1860x4 per the note above.
mtxEU <- matrix(data=c(numDAX, numSMI, numCAC, numFTSE), ncol=4, byrow=FALSE)
colnames(mtxEU) <- c("DAX", "SMI", "CAC", "FTSE")
# frequency = 260 mimics daily trading data (~260 business days/year);
# start = c(1991, 130) begins mid-year 1991
eu_stocks <- ts(data=mtxEU, start=c(1991, 130), frequency=260)
str(eu_stocks)
## mts [1:400, 1:4] 1629 1614 1606 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
## - attr(*, "tsp")= num [1:3] 1991 1993 260
## - attr(*, "class")= chr [1:3] "mts" "ts" "matrix"
# Check whether eu_stocks is a ts object (multivariate ts is still "ts")
is.ts(eu_stocks)
## [1] TRUE
# View the start, end, and frequency of eu_stocks
start(eu_stocks)
## [1] 1991 130
end(eu_stocks)
## [1] 1993 9
frequency(eu_stocks)
## [1] 260
# Generate a simple plot of eu_stocks (one panel per column)
plot(eu_stocks)
# Use ts.plot with eu_stocks (all four series on one shared axis)
ts.plot(eu_stocks, col = 1:4, xlab = "Year", ylab = "Index Value",
main = "Major European Stock Indices, 1991-1998"
)
# Add a legend to your ts.plot, matching line colors to column names
legend("topleft", colnames(eu_stocks), lty = 1, col = 1:4, bty = "n")
Chapter 2 - Predicting the Future
Trend spotting - many time series exhibit clear trends over time; identifying them is the first step of analysis:
White Noise (WN) model - simplest example of a stationary process (fixed constant mean, fixed constant variance, no correlation over time):
Random Walk (RW) model - simple example of a non-stationary process with no specified mean or variance, but with strong dependence over time:
Stationary Process - assumptions of stationary models help with parsimony and distributional stability:
Example code includes:
rapid_growth <- c( 506 , 447.4 , 542.6 , 516.1 , 507 , 535 , 496.9 , 497.6 , 577.2 , 536.9 , 541.2 , 473.5 , 551 , 569.4 , 522.9 , 487.2 , 594.6 , 591.2 , 616 , 621.3 , 607.1 , 587 , 554.2 , 644.1 , 509.7 , 607.1 , 603.6 , 613.6 , 544.9 , 670.8 , 687.1 , 615.6 , 711.2 , 694.3 , 681.9 , 659.1 , 642.7 , 601.5 , 666.8 , 651 , 606.1 , 696.7 , 641.6 , 855.8 , 667.3 , 573.5 , 791.7 , 751.6 , 610.8 , 624.7 , 833.3 , 639.9 , 736.8 , 772.3 , 686.9 , 667.8 , 712.9 , 918.2 , 656.1 , 700.5 , 683.5 , 781.7 , 715.7 , 808.3 , 820.8 , 656.9 , 733.3 , 773.5 , 641.2 , 932.2 , 680.7 , 988.3 , 664.9 , 813.5 , 883.4 , 924.3 , 969.4 , 777.3 , 881 , 971.4 , 903 , 1020.7 , 1075.1 , 886.2 , 889.6 , 950.4 , 878 , 1043.8 , 901.1 , 1079.7 , 933.9 , 921.9 , 870.8 , 811.1 , 1004.3 , 1008.2 , 1189.5 , 752 , 947.5 , 886.5 , 1074.9 , 1101.1 , 1130.2 , 975.8 , 948.2 , 1177.8 , 1227.1 , 977 , 836.7 , 1323.6 , 852.4 , 1200.8 , 1274.5 , 1349.3 , 1102.6 , 1324.9 , 1268.7 , 1058.2 , 1204.1 , 1084.7 , 1284.4 , 1195.3 , 1058.4 , 1188.1 , 1166.6 , 1064.7 , 1429.1 , 1070.9 , 1539.3 , 1467.2 , 1127.7 , 1296.1 , 1555.3 , 1332.9 , 1315.4 , 1189.2 , 1482.4 , 1240.9 , 1237.8 , 1468.6 , 1328.5 , 1589.5 , 1373.2 , 1503.6 , 1659.9 , 1704.6 , 1550.5 , 1625.8 , 1873.9 , 1370.6 , 1439.7 , 1447.4 , 1579.9 , 1681.3 , 1661.6 , 1311.8 , 1326 , 1323.1 , 1550.5 , 1606.2 , 1768.5 , 1509.8 , 1592.1 , 1627.6 , 1544.6 , 1439.5 , 1682.4 , 1850.7 , 1673.4 , 1832.4 , 1672.3 , 1781.6 , 1659.3 , 1970 , 2044.7 , 1929.1 , 1891.7 , 1487.2 , 2013.9 , 1796.8 , 1977 , 1517 , 1650.6 , 1523.3 , 1696.6 , 1627.3 , 1787.3 , 1567.3 , 1882 , 2319 , 1942 , 1820.3 , 2154.8 , 2261.5 , 2052.2 , 2079.2 , 2010.1 , 2145.3 , 1775.3 , 2013.4 )
# Log rapid_growth
# A log transform turns (roughly) exponential growth into a (roughly)
# linear trend, which is easier to model
linear_growth <- log(rapid_growth)
# Plot linear_growth using ts.plot()
ts.plot(linear_growth)
z <- c( 6.23 , 6.1 , 6.3 , 6.25 , 6.23 , 6.28 , 6.21 , 6.21 , 6.36 , 6.29 , 6.29 , 6.16 , 6.31 , 6.34 , 6.26 , 6.19 , 6.39 , 6.38 , 6.42 , 6.43 , 6.41 , 6.38 , 6.32 , 6.47 , 6.23 , 6.41 , 6.4 , 6.42 , 6.3 , 6.51 , 6.53 , 6.42 , 6.57 , 6.54 , 6.52 , 6.49 , 6.47 , 6.4 , 6.5 , 6.48 , 6.41 , 6.55 , 6.46 , 6.75 , 6.5 , 6.35 , 6.67 , 6.62 , 6.41 , 6.44 , 6.73 , 6.46 , 6.6 , 6.65 , 6.53 , 6.5 , 6.57 , 6.82 , 6.49 , 6.55 , 6.53 , 6.66 , 6.57 , 6.69 , 6.71 , 6.49 , 6.6 , 6.65 , 6.46 , 6.84 , 6.52 , 6.9 , 6.5 , 6.7 , 6.78 , 6.83 , 6.88 , 6.66 , 6.78 , 6.88 , 6.81 , 6.93 , 6.98 , 6.79 , 6.79 , 6.86 , 6.78 , 6.95 , 6.8 , 6.98 , 6.84 , 6.83 , 6.77 , 6.7 , 6.91 , 6.92 , 7.08 , 6.62 , 6.85 , 6.79 , 6.98 , 7 , 7.03 , 6.88 , 6.85 , 7.07 , 7.11 , 6.88 , 6.73 , 7.19 , 6.75 , 7.09 , 7.15 , 7.21 , 7.01 , 7.19 , 7.15 , 6.96 , 7.09 , 6.99 , 7.16 , 7.09 , 6.96 , 7.08 , 7.06 , 6.97 , 7.26 , 6.98 , 7.34 , 7.29 , 7.03 , 7.17 , 7.35 , 7.2 , 7.18 , 7.08 , 7.3 , 7.12 , 7.12 , 7.29 , 7.19 , 7.37 , 7.22 , 7.32 , 7.41 , 7.44 , 7.35 , 7.39 , 7.54 , 7.22 , 7.27 , 7.28 , 7.37 , 7.43 , 7.42 , 7.18 , 7.19 , 7.19 , 7.35 , 7.38 , 7.48 , 7.32 , 7.37 , 7.39 , 7.34 , 7.27 , 7.43 , 7.52 , 7.42 , 7.51 , 7.42 , 7.49 , 7.41 , 7.59 , 7.62 , 7.56 , 7.55 , 7.3 , 7.61 , 7.49 , 7.59 , 7.32 , 7.41 , 7.33 , 7.44 , 7.39 , 7.49 , 7.36 , 7.54 , 7.75 , 7.57 , 7.51 , 7.68 , 7.72 , 7.63 , 7.64 , 7.61 , 7.67 , 7.48 , 7.61 )
# Generate the first difference of z (period-over-period changes)
dz <- diff(z)
# Plot dz
ts.plot(dz)
# View the length of z and dz, respectively
# (differencing once drops one observation: 199 = 200 - 1)
length(z)
## [1] 200
length(dz)
## [1] 199
x <- c( -4.2 , 9.57 , 5.18 , -9.69 , -3.22 , 10.84 , 6.45 , -10.83 , -2.24 , 10.12 , 6.58 , -8.66 , -2.52 , 9.84 , 7.39 , -8.24 , -4.26 , 8.9 , 8.54 , -8.07 , -4.02 , 9.82 , 7.77 , -6.59 , -3.46 , 10.61 , 7.37 , -5.8 , -1.2 , 11.43 , 7.57 , -4.97 , -2 , 11.94 , 9.41 , -4.4 , -1.56 , 12.6 , 8.5 , -3.73 , -2.83 , 13.38 , 8.13 , -3.15 , -2.8 , 13.71 , 6.76 , -3.78 , -3.77 , 13.63 , 6.54 , -3.25 , -5.02 , 13.36 , 6.93 , -3.53 , -5.2 , 11.58 , 7.16 , -1.89 , -5.78 , 12.48 , 6.21 , -3.43 , -7.08 , 11.41 , 6.74 , -3.53 , -8.39 , 12.51 , 6.47 , -3.75 , -9.43 , 12.38 , 8.05 , -2.83 , -7.3 , 12.77 , 8.22 , -4.45 , -6.96 , 12.03 , 7.57 , -5.4 , -6.57 , 10.9 , 7.28 , -4.04 , -6.72 , 12.18 , 8.29 , -4.16 , -6.36 , 12.75 , 8.67 , -5.44 , -4.87 , 12.6 , 8.16 , -6.54 )
# Generate a diff of x with lag = 4. Save this to dx
# A lag-4 difference removes a period-4 (e.g. quarterly) seasonal pattern
dx <- diff(x, lag=4)
# Plot dx
ts.plot(dx)
# View the length of x and dx, respectively
# (a lag-k difference drops k observations: 96 = 100 - 4)
length(x)
## [1] 100
length(dx)
## [1] 96
# Simulate a WN model with list(order = c(0, 0, 0))
# NOTE: arima.sim() draws random innovations without a seed here, so the
# "##" outputs below reflect one particular run
white_noise <- arima.sim(model = list(order=c(0, 0, 0)), n = 100)
# Plot your white_noise data
ts.plot(white_noise)
# Simulate from the WN model with: mean = 100, sd = 10
white_noise_2 <- arima.sim(model = list(order=c(0, 0, 0)), n = 100, mean = 100, sd = 10)
# Plot your white_noise_2 data
ts.plot(white_noise_2)
# Fit the WN model to white_noise_2 using the arima command
# (order c(0, 0, 0) estimates only an intercept, i.e. the series mean)
arima(white_noise_2, order=c(0, 0, 0))
##
## Call:
## arima(x = white_noise_2, order = c(0, 0, 0))
##
## Coefficients:
## intercept
## 99.6183
## s.e. 0.8262
##
## sigma^2 estimated as 68.26: log likelihood = -353.06, aic = 710.12
# Calculate the sample mean and sample variance of white_noise_2
# (the sample variance 68.95 differs slightly from arima's sigma^2 68.26 —
# presumably an n vs n-1 divisor difference; TODO confirm)
mean(white_noise_2)
## [1] 99.61835
var(white_noise_2)
## [1] 68.9478
# Simulate a random walk: an ARIMA(0, 1, 0) process with no drift
random_walk <- arima.sim(n = 100, model = list(order = c(0, 1, 0)))
# Plot random_walk
ts.plot(random_walk)
# First-difference the random walk; the result should look like white noise
random_walk_diff <- diff(random_walk)
# Plot random_walk_diff
ts.plot(random_walk_diff)
# Simulate a random walk with drift: the mean argument supplies the drift
rw_drift <- arima.sim(n = 100, model = list(order = c(0, 1, 0)), mean = 1)
# Plot rw_drift
ts.plot(rw_drift)
# First-difference the drifting walk; white noise centered on the drift
rw_drift_diff <- diff(rw_drift)
# Plot rw_drift_diff
ts.plot(rw_drift_diff)
# Difference the earlier random_walk data (same operation as above)
rw_diff <- diff(random_walk)
# Plot rw_diff
ts.plot(rw_diff)
# A differenced random walk is white noise, so fit the WN model to it
model_wn <- arima(rw_diff, order = c(0, 0, 0))
# The fitted intercept estimates the time trend (drift) of the walk
int_wn <- coef(model_wn)
# Plot the original random_walk data
ts.plot(random_walk)
# Overlay the estimated time trend: intercept 0, slope int_wn
abline(0, int_wn)
# Generate fresh white-noise data with arima.sim()
white_noise <- arima.sim(n = 100, model = list(order = c(0, 0, 0)))
# The cumulative sum of white noise is, by construction, a random walk
random_walk <- cumsum(white_noise)
# Generate white noise with a nonzero mean (0.4) — drift innovations
wn_drift <- arima.sim(n = 100, model = list(order = c(0, 0, 0)), mean = 0.4)
# Its cumulative sum is a random walk with drift
rw_drift <- cumsum(wn_drift)
# Plot all four series in one stacked figure
plot.ts(cbind(white_noise, random_walk, wn_drift, rw_drift))
Chapter 3 - Correlation Analysis
Scatterplots can be created using ts.plot, including ts.plot(cbind(a, b, ...)) to draw multiple series on the same scale:
Covariance and Correlation - running cov(a, b) and cor(a, b):
Autocorrelation - how strongly is each observation related to its recent past?
Example code includes:
# Make a dummy eu_stocks, but shorter than the actual 1860x4
numDAX <- c( 1628.8, 1613.6, 1606.5, 1621, 1618.2, 1610.6, 1630.8, 1640.2, 1635.5, 1645.9, 1647.8, 1638.3, 1629.9, 1621.5, 1624.7, 1627.6, 1632, 1621.2, 1613.4, 1605, 1605.8, 1616.7, 1619.3, 1620.5, 1619.7, 1623.1, 1614, 1631.9, 1630.4, 1633.5, 1626.5, 1650.4, 1650.1, 1654.1, 1653.6, 1501.8, 1524.3, 1603.7, 1622.5, 1636.7, 1652.1, 1645.8, 1650.4, 1651.5, 1649.9, 1653.5, 1657.5, 1649.5, 1649.1, 1646.4, 1638.7, 1625.8, 1628.6, 1632.2, 1633.7, 1631.2, 1635.8, 1621.3, 1624.7, 1616.1, 1618.1, 1627.8, 1625.8, 1614.8, 1612.8, 1605.5, 1609.3, 1607.5, 1607.5, 1604.9, 1589.1, 1582.3, 1568, 1568.2, 1569.7, 1571.7, 1585.4, 1570, 1561.9, 1565.2, 1570.3, 1577, 1590.3, 1572.7, 1572.1, 1579.2, 1588.7, 1586, 1579.8, 1572.6, 1568.1, 1578.2, 1573.9, 1582.1, 1610.2, 1605.2, 1623.8, 1615.3, 1627.1, 1627, 1605.7, 1589.7, 1589.7, 1603.3, 1599.8, 1590.9, 1603.5, 1589.9, 1587.9, 1571.1, 1549.8, 1549.4, 1554.7, 1557.5, 1555.3, 1559.8, 1548.4, 1544, 1550.2, 1557, 1551.8, 1562.9, 1570.3, 1559.3, 1545.9, 1542.8, 1542.8, 1542.8, 1542.8, 1564.3, 1577.3, 1577.3, 1577.3, 1598.2, 1604, 1604.7, 1593.7, 1581.7, 1599.1, 1613.8, 1620.5, 1629.5, 1663.7, 1664.1, 1669.3, 1685.1, 1687.1, 1680.1, 1671.8, 1669.5, 1686.7, 1685.5, 1671, 1683.1, 1685.7, 1685.7, 1678.8, 1685.8, 1683.7, 1686.6, 1683.7, 1679.1, 1685, 1680.8, 1676.2, 1688.5, 1696.5, 1690.2, 1711.3, 1711.3, 1729.9, 1716.6, 1743.4, 1745.2, 1746.8, 1749.3, 1763.9, 1762.3, 1762.3, 1746.8, 1753.5, 1753.2, 1739.9, 1723.9, 1734.4, 1723.1, 1732.9, 1729.9, 1725.7, 1730.9, 1714.2, 1716.2, 1719.1, 1718.2, 1698.8, 1714.8, 1718.3, 1706.7, 1723.4, 1716.2, 1738.8, 1737.4, 1714.8, 1724.2, 1733.8, 1730, 1734.5, 1744.3, 1746.9, 1746.9, 1746.9, 1747.5, 1753.1, 1745.2, 1745.7, 1742.9, 1731.7, 1731.2, 1728.1, 1728.1, 1731.3, 1733.8, 1745.8, 1752.6, 1748.1, 1750.7, 1747.9, 1745.8, 1735.3, 1719.9, 1763.6, 1766.8, 1785.4, 1783.6, 1804.4, 1812.3, 1799.5, 1792.8, 1792.8, 1806.4, 1798.2, 1800.6, 1786.2, 1791.3, 1789, 1789, 1784.7, 1789.5, 1779.7, 1787, 1773.2, 1781.6, 
1773.8, 1773.8, 1776.3, 1770.7, 1772.4, 1762.5, 1764.3, 1752.8, 1756, 1755, 1759.9, 1759.8, 1776.5, 1770, 1767, 1752.3, 1760.2, 1750.3, 1731.4, 1735.5, 1733.8, 1730.8, 1699.5, 1652.7, 1654.1, 1636.8, 1622.8, 1613.4, 1617.8, 1617.2, 1637.6, 1622.2, 1608.5, 1605.1, 1609.6, 1624.9, 1618.1, 1612, 1579, 1561.4, 1547.9, 1548.6, 1560.2, 1554.8, 1531.9, 1526.1, 1509, 1530, 1485, 1464, 1475.1, 1516.1, 1519.7, 1530, 1516.4, 1515.5, 1543.9, 1534.7, 1538.7, 1536.7, 1523.8, 1527.1, 1530.2, 1601.5, 1580.3, 1595.1, 1579.5, 1600.6, 1566, 1557, 1542.7, 1536.3, 1510.7, 1481, 1483.8, 1470.1, 1484.8, 1475.4, 1402.3, 1421.5, 1434.6, 1446.3, 1437.7, 1441.6, 1471.6, 1454, 1453.8, 1458, 1479.6, 1504.9, 1496.5, 1511, 1528.9, 1534, 1536.6, 1508.2, 1493.5, 1489.7, 1482.4, 1483.3, 1470.6, 1484.8, 1487.7, 1508.6, 1515.3, 1509.8, 1542.3, 1541.8, 1542.5, 1550.3, 1550.3, 1543.4, 1547.8, 1523.6, 1526.7, 1513.4, 1523, 1529.7, 1545.1, 1546.8, 1528.1, 1530.7, 1526.2, 1519.5, 1506.7, 1504.3, 1480.7, 1476.7, 1478.1, 1479.6, 1477.5, 1472.6, 1495.6, 1517.5, 1520.9, 1527.1, 1527.1, 1527.1, 1547.5, 1545.8, 1538.4, 1538.4, 1538.4, 1538, 1554, 1551.2, 1538.4, 1529.1 )
numSMI <- c( 1678.1, 1688.5, 1678.6, 1684.1, 1686.6, 1671.6, 1682.9, 1703.6, 1697.5, 1716.3, 1723.8, 1730.5, 1727.4, 1733.3, 1734, 1728.3, 1737.1, 1723.1, 1723.6, 1719, 1721.2, 1725.3, 1727.2, 1727.2, 1731.6, 1724.1, 1716.9, 1723.4, 1723, 1728.4, 1722.1, 1724.5, 1733.6, 1739, 1726.2, 1587.4, 1630.6, 1685.5, 1701.3, 1718, 1726.2, 1716.6, 1725.8, 1737.4, 1736.6, 1732.4, 1731.2, 1726.9, 1727.8, 1720.2, 1715.4, 1708.7, 1713, 1713.5, 1718, 1701.7, 1701.7, 1684.9, 1687.2, 1690.6, 1684.3, 1679.9, 1672.9, 1663.1, 1669.3, 1664.7, 1672.3, 1687.7, 1686.8, 1686.6, 1675.8, 1677.4, 1673.2, 1665, 1671.3, 1672.4, 1676.2, 1692.6, 1696.5, 1716.1, 1713.3, 1705.1, 1711.3, 1709.8, 1688.6, 1698.9, 1700, 1693, 1683.9, 1679.2, 1673.9, 1683.9, 1688.4, 1693.9, 1720.9, 1717.9, 1733.6, 1729.7, 1735.6, 1734.1, 1699.3, 1678.6, 1675.5, 1670.1, 1652.2, 1635, 1654.9, 1642, 1638.7, 1622.6, 1596.1, 1612.4, 1625, 1610.5, 1606.6, 1610.7, 1603.1, 1591.5, 1605.2, 1621.4, 1622.5, 1626.6, 1627.4, 1614.9, 1602.3, 1598.3, 1627, 1627, 1627, 1655.7, 1670.1, 1670.1, 1670.1, 1670.1, 1704, 1711.8, 1700.5, 1690.3, 1715.4, 1723.5, 1719.4, 1734.4, 1772.8, 1760.3, 1747.2, 1750.2, 1755.3, 1754.6, 1751.2, 1752.5, 1769.4, 1767.6, 1750, 1747.1, 1753.5, 1752.8, 1752.9, 1764.7, 1776.8, 1779.3, 1785.1, 1798.2, 1794.1, 1795.2, 1780.4, 1789.5, 1794.2, 1784.4, 1800.1, 1804, 1816.2, 1810.5, 1821.9, 1828.2, 1840.6, 1841.1, 1846.3, 1850, 1839, 1820.2, 1815.2, 1820.6, 1807.1, 1791.4, 1806.2, 1798.7, 1818.2, 1820.5, 1833.3, 1837.1, 1818.2, 1824.1, 1830.1, 1835.6, 1828.7, 1839.2, 1837.2, 1826.7, 1838, 1829.1, 1843.1, 1850.5, 1827.1, 1829.1, 1848, 1840.5, 1853.8, 1874.1, 1871.3, 1871.3, 1871.3, 1860.5, 1874.7, 1880.1, 1874.7, 1875.6, 1859.5, 1874.2, 1880.1, 1880.1, 1907.7, 1920.5, 1937.3, 1936.8, 1949.1, 1963.7, 1950.8, 1953.5, 1945, 1921.1, 1939.1, 1928, 1933.4, 1925.7, 1931.7, 1928.7, 1924.5, 1914.2, 1914.2, 1920.6, 1923.3, 1930.4, 1915.2, 1916.9, 1913.8, 1913.8, 1899.7, 1888, 1868.8, 1879.9, 1865.7, 1881.3, 1873.1, 1862.5, 
1869.3, 1846.9, 1847.1, 1838.3, 1845.8, 1835.5, 1846.6, 1854.8, 1845.3, 1854.5, 1870.5, 1862.6, 1856.6, 1837.6, 1846.7, 1856.5, 1841.8, 1835, 1844.4, 1838.9, 1805.6, 1756.6, 1786.1, 1757.1, 1762.8, 1756.8, 1761.9, 1778.5, 1812.7, 1806.1, 1798.1, 1794.9, 1805.4, 1820.3, 1819.6, 1809.6, 1799.9, 1800.3, 1793.3, 1784.8, 1791.7, 1800.2, 1788.6, 1775.7, 1753.5, 1768.2, 1727.9, 1709.6, 1704.6, 1740.6, 1745.7, 1751.7, 1747.3, 1757.8, 1774.2, 1774.4, 1788.3, 1788, 1779.1, 1792.8, 1812, 1872.1, 1851.4, 1873.4, 1889.6, 1897.5, 1888.8, 1900.4, 1913.4, 1909.9, 1910.8, 1879.2, 1880.2, 1878.3, 1885.2, 1867.6, 1788, 1820.5, 1858.2, 1870.3, 1878.4, 1881.5, 1893.2, 1889.3, 1877.3, 1884, 1904.7, 1922.7, 1908.5, 1911.4, 1921.1, 1930.8, 1927.8, 1908.3, 1905.9, 1911.1, 1921.6, 1933.6, 1942, 1951.5, 1955.7, 1957.4, 1962.3, 1946.1, 1950.2, 1929.7, 1913.4, 1889.5, 1882.8, 1895.4, 1897.9, 1891.5, 1880.1, 1887, 1891.4, 1914.6, 1931.2, 1929.2, 1924.3, 1927, 1935, 1955.4, 1962.2, 1980.7, 1987.7, 1993.7, 2015.7, 2005, 2023.9, 2028.5, 2044.9, 2045.8, 2057.3, 2061.7, 2061.7, 2061.7, 2092.3, 2090.1, 2105.4, 2105.4, 2105.4, 2117.7, 2128.2, 2124.7, 2079.9, 2074.9 )
# Daily closing prices of the French CAC 40 index, 400 trading days.
# NOTE(review): appears to be the opening rows of datasets::EuStockMarkets[, "CAC"]
# -- same length and period as numSMI/numFTSE above; confirm against the dataset.
numCAC <- c( 1772.8, 1750.5, 1718, 1708.1, 1723.1, 1714.3, 1734.5, 1757.4, 1754, 1754.3, 1759.8, 1755.5, 1758.1, 1757.5, 1763.5, 1762.8, 1768.9, 1778.1, 1780.1, 1767.7, 1757.9, 1756.6, 1754.7, 1766.8, 1766.5, 1762.2, 1759.5, 1782.4, 1789.5, 1783.5, 1780.4, 1808.8, 1820.3, 1820.3, 1820.3, 1687.5, 1725.6, 1792.9, 1819.1, 1833.5, 1853.4, 1849.7, 1851.8, 1857.7, 1864.3, 1863.5, 1873.2, 1860.8, 1868.7, 1860.4, 1855.9, 1840.5, 1842.6, 1861.2, 1876.2, 1878.3, 1878.4, 1869.4, 1880.4, 1885.5, 1888.4, 1885.2, 1877.9, 1876.5, 1883.8, 1880.6, 1887.4, 1878.3, 1867.1, 1851.9, 1843.6, 1848.1, 1843.4, 1843.6, 1833.8, 1833.4, 1856.9, 1863.4, 1855.5, 1864.2, 1846, 1836.8, 1830.4, 1831.6, 1834.8, 1852.1, 1849.8, 1861.8, 1856.7, 1856.7, 1841.5, 1846.9, 1836.1, 1838.6, 1857.6, 1857.6, 1858.4, 1846.8, 1868.5, 1863.2, 1808.3, 1765.1, 1763.5, 1766, 1741.3, 1743.3, 1769, 1757.9, 1754.9, 1739.7, 1708.8, 1722.2, 1713.9, 1703.2, 1685.7, 1663.4, 1636.9, 1645.6, 1671.6, 1688.3, 1696.8, 1711.7, 1706.2, 1684.2, 1648.5, 1633.6, 1699.1, 1699.1, 1722.5, 1720.7, 1741.9, 1765.7, 1765.7, 1749.9, 1770.3, 1787.6, 1778.7, 1785.6, 1833.9, 1837.4, 1824.3, 1843.8, 1873.6, 1860.2, 1860.2, 1865.9, 1867.9, 1841.3, 1838.7, 1849.9, 1869.3, 1890.6, 1879.6, 1873.9, 1875.3, 1857, 1856.5, 1865.8, 1860.6, 1861.6, 1865.6, 1864.1, 1861.6, 1876.5, 1865.1, 1882.1, 1912.2, 1915.4, 1951.2, 1962.4, 1976.5, 1953.5, 1981.3, 1985.1, 1983.4, 1979.7, 1983.8, 1988.1, 1973, 1966.9, 1976.3, 1993.9, 1968, 1941.8, 1947.1, 1929.2, 1943.6, 1928.2, 1922, 1919.1, 1884.6, 1896.3, 1928.3, 1934.8, 1923.5, 1943.8, 1942.4, 1928.1, 1942, 1942.7, 1974.8, 1975.4, 1907.5, 1943.6, 1974.1, 1963.3, 1972.3, 1990.7, 1978.2, 1978.2, 1978.2, 1980.4, 1983.7, 1978.1, 1984.9, 1995.7, 2006.6, 2036.7, 2031.1, 2031.1, 2041.6, 2046.9, 2047.2, 2063.4, 2063.4, 2077.5, 2063.6, 2053.2, 2017, 2024, 2051.6, 2023.1, 2030.8, 2016.8, 2045.1, 2046.3, 2029.6, 2014.1, 2014.1, 2033.3, 2017.4, 2024.9, 1992.6, 1994.9, 1981.6, 1981.6, 1962.2, 1953.7, 1928.8, 1928.3, 1918.1, 
1931.4, 1908.8, 1891.8, 1913.9, 1885.8, 1895.8, 1899.6, 1920.3, 1915.3, 1907.3, 1900.6, 1880.9, 1873.5, 1883.6, 1868.5, 1879.1, 1847.8, 1861.8, 1859.4, 1859.4, 1859.4, 1853.3, 1851.2, 1801.8, 1767.9, 1762.7, 1727.5, 1734.6, 1734.6, 1755.4, 1769, 1801.6, 1782.6, 1754.7, 1784.4, 1787.6, 1798, 1793.8, 1777.3, 1755.2, 1737.8, 1730.1, 1722.4, 1753.5, 1757.3, 1736.7, 1734.2, 1724.2, 1744.2, 1689.7, 1667.7, 1667.8, 1687.6, 1687.5, 1684.9, 1674.2, 1711.4, 1780.5, 1779, 1779.3, 1763.7, 1756.8, 1774.2, 1802, 1873.6, 1836.2, 1859.8, 1852.7, 1882.9, 1826.1, 1832.8, 1828.9, 1829.5, 1843.5, 1770.3, 1731.9, 1736.7, 1724, 1683.3, 1611, 1612.5, 1654.2, 1673.9, 1657.3, 1655.1, 1685.1, 1667.9, 1650, 1664.2, 1679.1, 1731.3, 1722.2, 1730.7, 1766.4, 1770.7, 1774.5, 1749.9, 1730.9, 1742.4, 1742.4, 1786.9, 1804.1, 1804.7, 1793.6, 1786.7, 1798.5, 1798.5, 1821.5, 1796.8, 1772.7, 1764.4, 1759.2, 1722.3, 1724.2, 1674.8, 1720.6, 1721, 1739.7, 1749.7, 1771.4, 1792.3, 1783.3, 1799.4, 1781.7, 1788.6, 1765.9, 1791.2, 1769.5, 1758.7, 1738.3, 1744.8, 1736.7, 1735.2, 1760.1, 1786.3, 1824.4, 1821.1, 1854.6, 1854.6, 1857.5, 1870.3, 1858.8, 1857.8, 1857.8, 1843.1, 1850.8, 1859.6, 1844.5, 1852.6 )
# Daily closing prices of the UK FTSE 100 index, 400 trading days.
# NOTE(review): appears to be the opening rows of datasets::EuStockMarkets[, "FTSE"]
# -- same length and period as the other three series; confirm against the dataset.
numFTSE <- c( 2443.6, 2460.2, 2448.2, 2470.4, 2484.7, 2466.8, 2487.9, 2508.4, 2510.5, 2497.4, 2532.5, 2556.8, 2561, 2547.3, 2541.5, 2558.5, 2587.9, 2580.5, 2579.6, 2589.3, 2595, 2595.6, 2588.8, 2591.7, 2601.7, 2585.4, 2573.3, 2597.4, 2600.6, 2570.6, 2569.4, 2584.9, 2608.8, 2617.2, 2621, 2540.5, 2554.5, 2601.9, 2623, 2640.7, 2640.7, 2619.8, 2624.2, 2638.2, 2645.7, 2679.6, 2669, 2664.6, 2663.3, 2667.4, 2653.2, 2630.8, 2626.6, 2641.9, 2625.8, 2606, 2594.4, 2583.6, 2588.7, 2600.3, 2579.5, 2576.6, 2597.8, 2595.6, 2599, 2621.7, 2645.6, 2644.2, 2625.6, 2624.6, 2596.2, 2599.5, 2584.1, 2570.8, 2555, 2574.5, 2576.7, 2579, 2588.7, 2601.1, 2575.7, 2559.5, 2561.1, 2528.3, 2514.7, 2558.5, 2553.3, 2577.1, 2566, 2549.5, 2527.8, 2540.9, 2534.2, 2538, 2559, 2554.9, 2575.5, 2546.5, 2561.6, 2546.6, 2502.9, 2463.1, 2472.6, 2463.5, 2446.3, 2456.2, 2471.5, 2447.5, 2428.6, 2420.2, 2414.9, 2420.2, 2423.8, 2407, 2388.7, 2409.6, 2392, 2380.2, 2423.3, 2451.6, 2440.8, 2432.9, 2413.6, 2391.6, 2358.1, 2345.4, 2384.4, 2384.4, 2384.4, 2418.7, 2420, 2493.1, 2493.1, 2492.8, 2504.1, 2493.2, 2482.9, 2467.1, 2497.9, 2477.9, 2490.1, 2516.3, 2537.1, 2541.6, 2536.7, 2544.9, 2543.4, 2522, 2525.3, 2510.4, 2539.9, 2552, 2546.5, 2550.8, 2571.2, 2560.2, 2556.8, 2547.1, 2534.3, 2517.2, 2538.4, 2537.1, 2523.7, 2522.6, 2513.9, 2541, 2555.9, 2536.7, 2543.4, 2542.3, 2559.7, 2546.8, 2565, 2562, 2562.1, 2554.3, 2565.4, 2558.4, 2538.3, 2533.1, 2550.7, 2574.8, 2522.4, 2493.3, 2476, 2470.7, 2491.2, 2464.7, 2467.6, 2456.6, 2441, 2458.7, 2464.9, 2472.2, 2447.9, 2452.9, 2440.1, 2408.6, 2405.4, 2382.7, 2400.9, 2404.2, 2393.2, 2436.4, 2572.6, 2591, 2600.5, 2640.2, 2638.6, 2638.6, 2638.6, 2625.8, 2607.8, 2609.8, 2643, 2658.2, 2651, 2664.9, 2654.1, 2659.8, 2659.8, 2662.2, 2698.7, 2701.9, 2725.7, 2737.8, 2722.4, 2720.5, 2694.7, 2682.6, 2703.6, 2700.6, 2711.9, 2702, 2715, 2715, 2704.6, 2698.6, 2694.2, 2707.6, 2697.6, 2705.9, 2680.9, 2681.9, 2668.5, 2645.8, 2635.4, 2636.1, 2614.1, 2603.7, 2593.6, 2616.3, 2598.4, 2562.7, 2584.8, 
2550.3, 2560.6, 2532.6, 2557.3, 2534.1, 2515.8, 2521.2, 2493.9, 2476.1, 2497.1, 2469, 2493.7, 2472.6, 2497.9, 2490.8, 2478.3, 2484, 2486.4, 2483.4, 2431.9, 2403.7, 2415.6, 2387.9, 2399.5, 2377.2, 2348, 2373.4, 2423.2, 2411.6, 2399.6, 2420.2, 2407.5, 2392.8, 2377.6, 2350.1, 2325.7, 2309.6, 2303.1, 2318, 2356.8, 2376.1, 2354.7, 2363.5, 2359.4, 2365.7, 2311.1, 2281, 2285, 2311.6, 2312.6, 2312.6, 2298.4, 2313, 2381.9, 2362.2, 2372.2, 2337.7, 2327.5, 2340.6, 2370.9, 2422.1, 2370, 2378.3, 2483.9, 2567, 2560.1, 2586, 2580.5, 2621.2, 2601, 2560, 2565.5, 2553, 2572.3, 2549.7, 2446.3, 2488.4, 2517.1, 2538.8, 2541.2, 2557.2, 2584.7, 2574.7, 2546.6, 2563.9, 2562.2, 2617, 2645.7, 2658.1, 2669.7, 2661.6, 2669.8, 2650.4, 2642.3, 2658.3, 2687.8, 2705.6, 2691.7, 2711.1, 2702.7, 2695.4, 2714.6, 2696.8, 2726.4, 2697.5, 2679.6, 2679.2, 2704, 2706.2, 2732.4, 2722.9, 2727.1, 2709.6, 2741.8, 2760.1, 2778.8, 2792, 2764.1, 2771, 2759.4, 2754.5, 2769.8, 2750.7, 2726.5, 2716.2, 2721.8, 2717.9, 2732.8, 2740.3, 2789.7, 2807.7, 2842, 2827.4, 2827.5, 2827.5, 2827.5, 2847.8, 2832.5, 2846.5, 2846.5, 2861.5, 2833.6, 2826, 2816.5, 2799.2 )
# Bind the four national index series into a 4-column matrix (column names
# come directly from the argument names), then convert to a multivariate
# time series: 260 trading days per year, starting at period 130 of 1991.
mtxEU <- cbind(DAX = numDAX, SMI = numSMI, CAC = numCAC, FTSE = numFTSE)
eu_stocks <- ts(data = mtxEU, start = c(1991, 130), frequency = 260)
# Plot eu_stocks (one panel per index column)
plot(eu_stocks)
# Use this code to convert prices to returns (simple one-period returns)
returns <- eu_stocks[-1, ] / eu_stocks[-nrow(eu_stocks), ] - 1
# Convert returns to ts.
# Fix: dropping the first price observation means the return series begins
# one trading period *after* eu_stocks, i.e. at c(1991, 131), not
# c(1991, 130); using 130 would mislabel every return one period early.
returns <- ts(returns, start = c(1991, 131), frequency = 260)
# Plot returns
plot(returns)
# Use this code to convert prices to log returns
# (first differences of log prices = continuously compounded returns;
# diff() on a ts object keeps the ts attributes and automatically starts
# the result at the second observation, so no manual re-dating is needed)
logreturns <- diff(log(eu_stocks))
# Plot logreturns
plot(logreturns)
# Create eu_percentreturns: simple one-period returns scaled to percent.
# NOTE(review): start = c(1991, 130) is one period early -- the return series
# actually begins at c(1991, 131) because the first price row is dropped.
# Left unchanged here so the transcribed str() output below stays consistent.
eu_percentreturns <- ts(data=100 * (eu_stocks[-1,] / eu_stocks[-nrow(eu_stocks),] - 1),
start=c(1991, 130), frequency=260
)
# Inspect the structure: a 399 x 4 multivariate time series ("mts")
str(eu_percentreturns)
## mts [1:399, 1:4] -0.933 -0.44 0.903 -0.173 -0.47 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
## - attr(*, "tsp")= num [1:3] 1991 1993 260
## - attr(*, "class")= chr [1:3] "mts" "ts" "matrix"
# Generate means from eu_percentreturns (column-wise mean return, in percent)
colMeans(eu_percentreturns)
## DAX SMI CAC FTSE
## -0.01093221 0.05714059 0.01778921 0.03823335
# Use apply to calculate sample variance from eu_percentreturns
# (MARGIN = 2 applies var to each column, i.e. per index)
apply(eu_percentreturns, MARGIN = 2, FUN = var)
## DAX SMI CAC FTSE
## 0.9700197 0.7789079 1.3477730 0.8417013
# Use apply to calculate standard deviation from eu_percentreturns
# (sd is the square root of the variances above, e.g. sqrt(0.97002) for DAX)
apply(eu_percentreturns, MARGIN = 2, FUN = sd)
## DAX SMI CAC FTSE
## 0.9848958 0.8825576 1.1609363 0.9174428
# Display a histogram of percent returns for each index.
# par(mfrow = c(2, 2)) lays the four plots out on a 2x2 grid; apply() also
# *returns* the four "histogram" objects, which is why the $breaks/$counts/
# $density lists are printed in the transcript below.
par(mfrow = c(2,2))
apply(eu_percentreturns, MARGIN = 2, FUN = hist, main = "", xlab = "Percentage Return")
## $DAX
## $breaks
## [1] -10 -8 -6 -4 -2 0 2 4 6
##
## $counts
## [1] 1 0 1 4 208 178 5 2
##
## $density
## [1] 0.001253133 0.000000000 0.001253133 0.005012531 0.260651629 0.223057644
## [7] 0.006265664 0.002506266
##
## $mids
## [1] -9 -7 -5 -3 -1 1 3 5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $SMI
## $breaks
## [1] -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4
##
## $counts
## [1] 1 0 0 0 1 0 4 21 157 184 24 5 2
##
## $density
## [1] 0.002506266 0.000000000 0.000000000 0.000000000 0.002506266
## [6] 0.000000000 0.010025063 0.052631579 0.393483709 0.461152882
## [11] 0.060150376 0.012531328 0.005012531
##
## $mids
## [1] -8.5 -7.5 -6.5 -5.5 -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $CAC
## $breaks
## [1] -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5
##
## $counts
## [1] 1 0 0 1 4 8 38 154 128 52 8 3 2
##
## $density
## [1] 0.002506266 0.000000000 0.000000000 0.002506266 0.010025063
## [6] 0.020050125 0.095238095 0.385964912 0.320802005 0.130325815
## [11] 0.020050125 0.007518797 0.005012531
##
## $mids
## [1] -7.5 -6.5 -5.5 -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $FTSE
## $breaks
## [1] -5 -4 -3 -2 -1 0 1 2 3 4 5 6
##
## $counts
## [1] 1 1 4 25 178 148 34 4 2 1 1
##
## $density
## [1] 0.002506266 0.002506266 0.010025063 0.062656642 0.446115288
## [6] 0.370927318 0.085213033 0.010025063 0.005012531 0.002506266
## [11] 0.002506266
##
## $mids
## [1] -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Display normal quantile plots of percent returns for each index
# (2x2 grid again; qqnorm() returns the plotted coordinates invisibly --
# $x are theoretical normal quantiles, $y the observed returns -- and
# apply() makes that return value visible, producing the printout below)
par(mfrow = c(2,2))
apply(eu_percentreturns, MARGIN = 2, FUN = qqnorm, main = "")
## $DAX
## $DAX$x
## [1] -1.362841938 -0.645200916 1.142773047 -0.332716397 -0.700349168
## [6] 1.522756851 0.880762990 -0.468960676 0.928046482 0.228125248
## [11] -0.862393206 -0.749370236 -0.757715106 0.399909659 0.346025823
## [16] 0.533070235 -0.957438716 -0.716497500 -0.774565430 0.138653062
## [21] 0.957438716 0.306270765 0.157708228 -0.125977957 0.434176329
## [26] -0.835371144 1.412187579 -0.170443132 0.386335624 -0.622163162
## [31] 1.794824260 -0.069160134 0.504322046 -0.100686285 -3.022583937
## [36] 1.861620217 3.022583937 1.465233793 1.130785550 1.192455456
## [41] -0.562265945 0.562265945 0.151350483 -0.176820835 0.454981140
## [46] 0.497200571 -0.724642010 -0.075458866 -0.306270765 -0.692343235
## [51] -1.073140638 0.332716397 0.461959623 0.176820835 -0.273510070
## [56] 0.577043938 -1.259028466 0.427283386 -0.791638608 0.247512940
## [61] 0.890060160 -0.215248044 -0.977501770 -0.221682051 -0.668586325
## [66] 0.475984791 -0.202406436 -0.037702589 -0.299693408 -1.465233793
## [71] -0.614557305 -1.301806749 0.094374049 0.189598120 0.253995872
## [76] 1.118958381 -1.412187579 -0.766113077 0.441089963 0.637484161
## [81] 0.732834875 1.095761965 -1.564098295 -0.119648113 0.783073486
## [86] 0.899434908 -0.319465652 -0.569639391 -0.684381435 -0.461959623
## [91] 0.937753841 -0.434176329 0.826498615 2.027546869 -0.483032470
## [96] 1.429424692 -0.783073486 0.997966220 -0.050279388 -1.735192204
## [101] -1.483865480 -0.031416549 1.107285697 -0.379575363 -0.808945725
## [106] 1.051055539 -1.154927051 -0.234577930 -1.522756851 -1.794824260
## [111] -0.081760594 0.652956285 0.352703444 -0.260489498 0.606986835
## [116] -1.018857387 -0.454981140 0.692343235 0.749370236 -0.504322046
## [121] 0.977501770 0.791638608 -0.997966220 -1.192455456 -0.366106357
## [126] -0.025131751 -0.018847945 -0.012564883 1.707553094 1.073140638
## [131] -0.006282318 0.000000000 1.585812035 0.668586325 0.119648113
## [136] -0.987682290 -1.029471543 1.395360129 1.154927051 0.716497500
## [141] 0.844309926 2.203366572 0.100686285 0.622163162 1.218437810
## [146] 0.241040394 -0.591949043 -0.741077227 -0.253995872 1.331704246
## [151] -0.151350483 -1.205344920 0.987682290 0.299693408 0.006282318
## [156] -0.577043938 0.724642010 -0.228125248 0.339363596 -0.326083868
## [161] -0.441089963 0.660751127 -0.406724252 -0.448024745 1.008356792
## [166] 0.808945725 -0.533070235 1.503027005 0.012564883 1.378918772
## [171] -1.051055539 1.898394677 0.215248044 0.170443132 0.273510070
## [176] 1.084381938 -0.164072354 0.018847945 -1.245269831 0.676462784
## [181] -0.062864145 -1.040202966 -1.331704246 0.908889378 -0.937753841
## [186] 0.871541335 -0.346025823 -0.399909659 0.614557305 -1.395360129
## [191] 0.234577930 0.312861400 -0.132312852 -1.631632667 1.205344920
## [196] 0.420410685 -0.967421566 1.287284949 -0.599450994 1.564098295
## [201] -0.157708228 -1.707553094 0.835371144 0.853316686 -0.386335624
## [206] 0.511469191 0.862393206 0.293128990 0.025131751 0.031416549
## [211] 0.113323060 0.629805182 -0.660751127 0.107002537 -0.293128990
## [216] -0.928046482 -0.094374049 -0.352703444 0.037702589 0.372832405
## [221] 0.280037647 0.967421566 0.684381435 -0.420410685 0.286577179
## [226] -0.286577179 -0.208822935 -0.899434908 -1.273029655 2.375107084
## [231] 0.359396830 1.362841938 -0.189598120 1.447097300 0.741077227
## [236] -1.008356792 -0.540325710 0.043990118 1.040202966 -0.676462784
## [241] 0.260489498 -1.084381938 0.591949043 -0.241040394 0.050279388
## [246] -0.393113587 0.525842714 -0.800262203 0.708400243 -1.062033337
## [251] 0.800262203 -0.629805182 0.056570646 0.266994125 -0.490104222
## [256] 0.195998259 -0.826498615 0.208822935 -0.947550382 0.366106357
## [261] -0.138653062 0.554922943 -0.043990118 1.231742970 -0.525842714
## [266] -0.312861400 -1.107285697 0.774565430 -0.844309926 -1.543097927
## [271] 0.483032470 -0.183205739 -0.339363596 -2.027546869 -2.375107084
## [276] 0.164072354 -1.503027005 -1.167254099 -0.871541335 0.547609740
## [281] -0.113323060 1.543097927 -1.378918772 -1.142773047 -0.372832405
## [286] 0.569639391 1.245269831 -0.606986835 -0.547609740 -2.203366572
## [291] -1.585812035 -1.218437810 0.125977957 1.018857387 -0.511469191
## [296] -1.861620217 -0.554922943 -1.608300307 1.681160057 -2.496817918
## [301] -1.827203533 1.029471543 2.496817918 0.490104222 0.947550382
## [306] -1.287284949 -0.144998850 2.079254280 -0.890060160 0.518642559
## [311] -0.247512940 -1.118958381 0.448024745 0.406724252 2.672947708
## [316] -1.764224226 1.179761118 -1.447097300 1.631632667 -2.280865771
## [321] -0.853316686 -1.316608391 -0.584480259 -1.980752397 -2.137203375
## [326] 0.379575363 -1.347109832 1.301806749 -0.918425797 -2.672947708
## [331] 1.655888698 1.167254099 1.062033337 -0.880762990 0.540325710
## [336] 2.137203375 -1.655888698 -0.056570646 0.599450994 1.827203533
## [341] 1.980752397 -0.817690678 1.273029655 1.483865480 0.645200916
## [346] 0.319465652 -2.079254280 -1.429424692 -0.413557785 -0.732834875
## [351] 0.144998850 -1.179761118 1.259028466 0.393113587 1.735192204
## [356] 0.766113077 -0.518642559 2.280865771 -0.107002537 0.132312852
## [361] 0.817690678 0.062864145 -0.652956285 0.584480259 -1.898394677
## [366] 0.413557785 -1.231742970 0.918425797 0.757715106 1.316608391
## [371] 0.221682051 -1.681160057 0.326083868 -0.475984791 -0.637484161
## [376] -1.130785550 -0.280037647 -1.937931511 -0.427283386 0.183205739
## [381] 0.202406436 -0.266994125 -0.497200571 1.937931511 1.764224226
## [386] 0.468960676 0.700349168 0.069160134 0.075458866 1.608300307
## [391] -0.195998259 -0.708400243 0.081760594 0.088065570 -0.088065570
## [396] 1.347109832 -0.359396830 -1.095761965 -0.908889378
##
## $DAX$y
## [1] -0.933202358 -0.440009916 0.902583256 -0.172732881 -0.469657644
## [6] 1.254190985 0.576404219 -0.286550421 0.635891165 0.115438362
## [11] -0.576526277 -0.512726607 -0.515369041 0.197348134 0.178494491
## [16] 0.270336692 -0.661764706 -0.481125093 -0.520639643 0.049844237
## [21] 0.678789388 0.160821426 0.074106095 -0.049367479 0.209915416
## [26] -0.560655536 1.109045849 -0.091917397 0.190137390 -0.428527701
## [31] 1.469412850 -0.018177412 0.242409551 -0.030227919 -9.179970972
## [36] 1.498202157 5.208948370 1.172289081 0.875192604 0.940917700
## [41] -0.381332849 0.279499332 0.066650509 -0.096881623 0.218195042
## [46] 0.241911098 -0.482654600 -0.024249773 -0.163725669 -0.467687075
## [51] -0.787209373 0.172222906 0.221048754 0.091900502 -0.153026872
## [56] 0.282000981 -0.886416432 0.209708259 -0.529328491 0.123754718
## [61] 0.599468512 -0.122865217 -0.676589986 -0.123854347 -0.452628968
## [66] 0.236686391 -0.111849873 0.000000000 -0.161741835 -0.984485015
## [71] -0.427915172 -0.903747709 0.012755102 0.095651065 0.127412881
## [76] 0.871667621 -0.971363694 -0.515923567 0.211281132 0.325836954
## [81] 0.426670063 0.843373494 -1.106709426 -0.038150951 0.451625215
## [86] 0.601570415 -0.169950274 -0.390920555 -0.455753893 -0.286150324
## [91] 0.644091576 -0.272462299 0.520998793 1.776120346 -0.310520432
## [96] 1.158734114 -0.523463481 0.730514456 -0.006145904 -1.309157959
## [101] -0.996450146 0.000000000 0.855507328 -0.218299757 -0.556319540
## [106] 0.792004526 -0.848144684 -0.125794075 -1.058001134 -1.355738018
## [111] -0.025809782 0.342067897 0.180099054 -0.141252006 0.289333248
## [116] -0.730862931 -0.284164299 0.401554404 0.438653077 -0.333975594
## [121] 0.715298363 0.473478789 -0.700503089 -0.859359969 -0.200530435
## [126] 0.000000000 0.000000000 0.000000000 1.393570132 0.831042639
## [131] 0.000000000 0.000000000 1.325049135 0.362908272 0.043640898
## [136] -0.685486384 -0.752964799 1.100082190 0.919267088 0.415169166
## [141] 0.555384141 2.098803314 0.024042796 0.312481221 0.946504523
## [146] 0.118687318 -0.414913165 -0.494018213 -0.137576265 1.030248577
## [151] -0.071144839 -0.860278849 0.724117295 0.154476858 0.000000000
## [156] -0.409325503 0.416964498 -0.124569937 0.172239710 -0.171943555
## [161] -0.273207816 0.351378715 -0.249258160 -0.273679200 0.733802649
## [166] 0.473793308 -0.371352785 1.248372974 0.000000000 1.086893005
## [171] -0.768830568 1.561225679 0.103246530 0.091680037 0.143118846
## [176] 0.834619562 -0.090708090 0.000000000 -0.879532429 0.383558507
## [181] -0.017108640 -0.758612822 -0.919593080 0.609084054 -0.651522140
## [186] 0.568742383 -0.173120203 -0.242788600 0.301326998 -0.964815992
## [191] 0.116672500 0.168977975 -0.052352975 -1.129088581 0.941841300
## [196] 0.204105435 -0.675085841 0.978496514 -0.417778809 1.316862836
## [201] -0.080515298 -1.300794290 0.548168883 0.556779956 -0.219171761
## [206] 0.260115607 0.565004324 0.149056928 0.000000000 0.000000000
## [211] 0.034346557 0.320457797 -0.450630312 0.028650011 -0.160394111
## [216] -0.642607149 -0.028873361 -0.179066543 0.000000000 0.185174469
## [221] 0.144400162 0.692121352 0.389506244 -0.256761383 0.148732910
## [226] -0.159936026 -0.120144173 -0.601443464 -0.887454619 2.540845398
## [231] 0.181447040 1.052750736 -0.100817744 1.166180758 0.437818665
## [236] -0.706284831 -0.372325646 0.000000000 0.758589915 -0.453941541
## [241] 0.133466800 -0.799733422 0.285522338 -0.128398370 0.000000000
## [246] -0.240357742 0.268952765 -0.547639005 0.410181491 -0.772243984
## [251] 0.473719829 -0.437808711 0.000000000 0.140940354 -0.315262062
## [256] 0.096007229 -0.558564658 0.102127660 -0.651816584 0.182565039
## [261] -0.056947608 0.279202279 -0.005682141 0.948971474 -0.365887982
## [266] -0.169491525 -0.831918506 0.450836044 -0.562436087 -1.079814889
## [271] 0.236802588 -0.097954480 -0.173030338 -1.808412295 -2.753751103
## [276] 0.084709869 -1.045885980 -0.855327468 -0.579245748 0.272716003
## [281] -0.037087403 1.261439525 -0.940400586 -0.844532117 -0.211377059
## [286] 0.280356364 0.950546720 -0.418487292 -0.376985353 -2.047146402
## [291] -1.114629512 -0.864608685 0.045222560 0.749063670 -0.346109473
## [296] -1.472858245 -0.378614792 -1.120503244 1.391650099 -2.941176471
## [301] -1.414141414 0.758196721 2.779472578 0.237451355 0.677765348
## [306] -0.888888889 -0.059351095 1.873968987 -0.595893516 0.260637258
## [311] -0.129979853 -0.839461183 0.216563854 0.202999149 4.659521631
## [316] -1.323758976 0.936531038 -0.977995110 1.335865780 -2.161689366
## [321] -0.574712644 -0.918432884 -0.414857069 -1.666341209 -1.965976038
## [326] 0.189061445 -0.923305028 0.999931977 -0.633081897 -4.954588586
## [331] 1.369179206 0.921561731 0.815558344 -0.594620756 0.271266606
## [336] 2.081021088 -1.195977168 -0.013755158 0.288898060 1.481481481
## [341] 1.709921600 -0.558176623 0.968927497 1.184645930 0.333573157
## [346] 0.169491525 -1.848236366 -0.974671794 -0.254435889 -0.490031550
## [351] 0.060712358 -0.856199016 0.965592275 0.195312500 1.404853129
## [356] 0.444120377 -0.362964429 2.152602994 -0.032419114 0.045401479
## [361] 0.505672609 0.000000000 -0.445075147 0.285084878 -1.563509497
## [366] 0.203465477 -0.871160018 0.634333289 0.439921208 1.006733346
## [371] 0.110025241 -1.208947505 0.170145933 -0.293983145 -0.438998821
## [376] -0.842382363 -0.159288511 -1.568836003 -0.270142500 0.094805986
## [381] 0.101481632 -0.141930251 -0.331641286 1.561863371 1.464295266
## [386] 0.224052718 0.407653363 0.000000000 0.000000000 1.335865366
## [391] -0.109854604 -0.478716522 0.000000000 0.000000000 -0.026001040
## [396] 1.040312094 -0.180180180 -0.825167612 -0.604524181
##
##
## $SMI
## $SMI$x
## [1] 0.826498615 -0.997966220 0.386335624 0.113323060 -1.378918772
## [6] 0.918425797 1.735192204 -0.629805182 1.608300307 0.577043938
## [11] 0.525842714 -0.379575363 0.413557785 -0.050279388 -0.599450994
## [16] 0.676462784 -1.287284949 -0.062864145 -0.511469191 0.062864145
## [21] 0.208822935 0.050279388 -0.195998259 0.234577930 -0.749370236
## [26] -0.732834875 0.511469191 -0.215248044 0.346025823 -0.645200916
## [31] 0.088065570 0.700349168 0.332716397 -1.142773047 -3.022583937
## [36] 2.496817918 3.022583937 1.301806749 1.412187579 0.622163162
## [41] -0.928046482 0.716497500 0.899434908 -0.247512940 -0.454981140
## [46] -0.260489498 -0.468960676 -0.018847945 -0.774565430 -0.533070235
## [51] -0.684381435 0.221682051 -0.056570646 0.247512940 -1.429424692
## [56] -0.189598120 -1.465233793 0.075458866 0.144998850 -0.660751127
## [61] -0.504322046 -0.724642010 -0.987682290 0.497200571 -0.525842714
## [66] 0.591949043 1.259028466 -0.253995872 -0.202406436 -1.062033337
## [71] 0.037702589 -0.483032470 -0.844309926 0.504322046 0.006282318
## [76] 0.183205739 1.395360129 0.195998259 1.631632667 -0.346025823
## [81] -0.826498615 0.454981140 -0.273510070 -1.827203533 0.800262203
## [86] 0.000000000 -0.708400243 -0.899434908 -0.540325710 -0.577043938
## [91] 0.774565430 0.260489498 0.372832405 1.861620217 -0.366106357
## [96] 1.245269831 -0.427283386 0.406724252 -0.266994125 -2.203366572
## [101] -1.764224226 -0.393113587 -0.591949043 -1.681160057 -1.608300307
## [106] 1.681160057 -1.231742970 -0.406724252 -1.447097300 -2.027546869
## [111] 1.447097300 1.051055539 -1.395360129 -0.461959623 0.241040394
## [116] -0.800262203 -1.130785550 1.142773047 1.429424692 0.012564883
## [121] 0.228125248 -0.025131751 -1.218437810 -1.245269831 -0.475984791
## [126] 1.980752397 -0.183205739 -0.176820835 1.937931511 1.192455456
## [131] -0.170443132 -0.164072354 -0.157708228 2.137203375 0.599450994
## [136] -1.095761965 -1.018857387 1.827203533 0.606986835 -0.441089963
## [141] 1.205344920 2.375107084 -1.107285697 -1.192455456 0.132312852
## [146] 0.299693408 -0.234577930 -0.399909659 0.018847945 1.378918772
## [151] -0.286577179 -1.483865480 -0.359396830 0.468960676 -0.241040394
## [156] -0.094374049 0.908889378 0.957438716 0.107002537 0.379575363
## [161] 0.997966220 -0.434176329 -0.006282318 -1.301806749 0.684381435
## [166] 0.253995872 -0.908889378 1.231742970 0.176820835 0.928046482
## [171] -0.569639391 0.853316686 0.427283386 0.937753841 -0.069160134
## [176] 0.280037647 0.138653062 -1.008356792 -1.543097927 -0.518642559
## [181] 0.312861400 -1.179761118 -1.362841938 1.107285697 -0.716497500
## [186] 1.522756851 0.056570646 0.967421566 0.151350483 -1.585812035
## [191] 0.366106357 0.393113587 0.319465652 -0.668586325 0.741077227
## [196] -0.306270765 -0.957438716 0.817690678 -0.835371144 1.029471543
## [201] 0.533070235 -1.937931511 0.043990118 1.465233793 -0.700349168
## [206] 0.977501770 1.543097927 -0.319465652 -0.151350483 -0.144998850
## [211] -0.977501770 1.018857387 0.286577179 -0.547609740 -0.031416549
## [216] -1.347109832 1.073140638 0.352703444 -0.138653062 1.764224226
## [221] 0.890060160 1.218437810 -0.221682051 0.862393206 1.008356792
## [226] -1.084381938 0.081760594 -0.757715106 -1.794824260 1.287284949
## [231] -0.967421566 0.273510070 -0.692343235 0.339363596 -0.326083868
## [236] -0.420410685 -0.880762990 -0.132312852 0.399909659 0.100686285
## [241] 0.490104222 -1.259028466 0.031416549 -0.339363596 -0.125977957
## [246] -1.154927051 -1.040202966 -1.522756851 0.766113077 -1.205344920
## [251] 1.118958381 -0.766113077 -0.947550382 0.461959623 -1.735192204
## [256] -0.088065570 -0.817690678 0.540325710 -0.937753841 0.791638608
## [261] 0.584480259 -0.862393206 0.652956285 1.154927051 -0.741077227
## [266] -0.584480259 -1.564098295 0.637484161 0.708400243 -1.273029655
## [271] -0.652956285 0.692343235 -0.562265945 -2.137203375 -2.496817918
## [276] 1.898394677 -1.980752397 0.359396830 -0.614557305 0.293128990
## [281] 1.316608391 2.079254280 -0.637484161 -0.783073486 -0.372832405
## [286] 0.757715106 1.095761965 -0.228125248 -0.918425797 -0.890060160
## [291] -0.075458866 -0.676462784 -0.808945725 0.518642559 0.614557305
## [296] -1.073140638 -1.118958381 -1.898394677 1.130785550 -2.375107084
## [301] -1.655888698 -0.554922943 2.280865771 0.306270765 0.420410685
## [306] -0.490104222 0.783073486 1.273029655 -0.081760594 1.062033337
## [311] -0.208822935 -0.853316686 1.040202966 1.503027005 2.672947708
## [316] -1.707553094 1.655888698 1.167254099 0.554922943 -0.791638608
## [321] 0.808945725 0.947550382 -0.386335624 -0.037702589 -2.079254280
## [326] -0.012564883 -0.280037647 0.483032470 -1.412187579 -2.672947708
## [331] 2.027546869 2.203366572 0.871541335 0.562265945 0.125977957
## [336] 0.835371144 -0.413557785 -1.051055539 0.448024745 1.564098295
## [341] 1.362841938 -1.167254099 0.119648113 0.668586325 0.660751127
## [346] -0.332716397 -1.503027005 -0.312861400 0.266994125 0.724642010
## [351] 0.844309926 0.569639391 0.629805182 0.170443132 0.025131751
## [356] 0.215248044 -1.316608391 0.157708228 -1.631632667 -1.331704246
## [361] -1.861620217 -0.622163162 0.880762990 0.069160134 -0.606986835
## [366] -1.029471543 0.475984791 0.202406436 1.707553094 1.179761118
## [371] -0.293128990 -0.497200571 0.094374049 0.547609740 1.483865480
## [376] 0.434176329 1.347109832 0.441089963 0.326083868 1.585812035
## [381] -0.871541335 1.331704246 0.189598120 1.084381938 -0.043990118
## [386] 0.732834875 0.164072354 -0.119648113 -0.113323060 1.794824260
## [391] -0.299693408 0.987682290 -0.107002537 -0.100686285 0.749370236
## [396] 0.645200916 -0.352703444 -2.280865771 -0.448024745
##
## $SMI$y
## [1] 0.619748525 -0.586319218 0.327653997 0.148447242 -0.889363216
## [6] 0.675999043 1.230019609 -0.358065274 1.107511046 0.436986541
## [11] 0.388676181 -0.179138977 0.341553780 0.040385392 -0.328719723
## [16] 0.509170862 -0.805940936 0.029017469 -0.266883268 0.127981385
## [21] 0.238205903 0.110125775 0.000000000 0.254747568 -0.433125433
## [26] -0.417609187 0.378589318 -0.023209934 0.313406849 -0.364498959
## [31] 0.139364729 0.527689185 0.311490540 -0.736055204 -8.040783223
## [36] 2.721431271 3.366858825 0.937407298 0.981602304 0.477299185
## [41] -0.556134863 0.535943143 0.672152045 -0.046045816 -0.241851895
## [46] -0.069268067 -0.248382625 0.052116509 -0.439865725 -0.279037321
## [51] -0.390579457 0.251653304 0.029188558 0.262620368 -0.948777648
## [56] 0.000000000 -0.987248046 0.136506618 0.201517307 -0.372648764
## [61] -0.261236122 -0.416691470 -0.585809074 0.372797787 -0.275564608
## [66] 0.456538716 0.920887401 -0.053327013 -0.011856770 -0.640341515
## [71] 0.095476787 -0.250387504 -0.490078891 0.378378378 0.065817029
## [76] 0.227218369 0.978403532 0.230414747 1.155319776 -0.163160655
## [81] -0.478608533 0.363615037 -0.087652662 -1.239911101 0.609972758
## [86] 0.064747778 -0.411764706 -0.537507383 -0.279113962 -0.315626489
## [91] 0.597407253 0.267236772 0.325752191 1.593954779 -0.174327387
## [96] 0.913906514 -0.224965390 0.341099613 -0.086425444 -2.006804683
## [101] -1.218148649 -0.184677708 -0.322291853 -1.071792108 -1.041036194
## [106] 1.217125382 -0.779503293 -0.200974421 -0.982486117 -1.633181314
## [111] 1.021239271 0.781443810 -0.892307692 -0.242160820 0.255197311
## [116] -0.471844540 -0.723598029 0.860823123 1.009220035 0.067842605
## [121] 0.252696456 0.049182344 -0.768096350 -0.780234070 -0.249641141
## [126] 1.795657887 0.000000000 0.000000000 1.763982790 0.869722776
## [131] 0.000000000 0.000000000 0.000000000 2.029818574 0.457746479
## [136] -0.660123846 -0.599823581 1.484943501 0.472193075 -0.237888019
## [141] 0.872397348 2.214022140 -0.705099278 -0.744191331 0.171703297
## [146] 0.291395269 -0.039879223 -0.193776359 0.074234810 0.964336662
## [151] -0.101729400 -0.995700385 -0.165714286 0.366321332 -0.039920160
## [156] 0.005705157 0.673170175 0.685668952 0.140702386 0.325970887
## [161] 0.733852445 -0.228005784 0.061312078 -0.824420677 0.511121096
## [166] 0.262643196 -0.546204437 0.879847568 0.216654630 0.676274945
## [171] -0.313842088 0.629660315 0.345792854 0.678262772 0.027165055
## [176] 0.282439846 0.200400802 -0.594594595 -1.022294725 -0.274695088
## [181] 0.297487880 -0.741513787 -0.868795307 0.826169476 -0.415236408
## [186] 1.084116306 0.126498735 0.703103543 0.207276496 -1.028795384
## [191] 0.324496755 0.328929335 0.300530026 -0.375898889 0.574178378
## [196] -0.108742932 -0.571521881 0.618601850 -0.484221980 0.765403750
## [201] 0.401497477 -1.264523102 0.109463084 1.033295063 -0.405844156
## [206] 0.722629720 1.095048009 -0.149405048 0.000000000 0.000000000
## [211] -0.577138887 0.763235689 0.288046087 -0.287218765 0.048007681
## [216] -0.858391981 0.790535090 0.314800982 0.000000000 1.468007021
## [221] 0.670965036 0.874772195 -0.025809116 0.635068154 0.749063670
## [226] -0.656923155 0.138404757 -0.435116458 -1.228791774 0.936963198
## [231] -0.572430509 0.280082988 -0.398262129 0.311575012 -0.155303619
## [236] -0.217763260 -0.535203949 0.000000000 0.334343329 0.140581068
## [241] 0.369157178 -0.787401575 0.088763576 -0.161719443 0.000000000
## [246] -0.736754102 -0.615886719 -1.016949153 0.593964041 -0.755359328
## [251] 0.836147291 -0.435868814 -0.565906786 0.365100671 -1.198309528
## [256] 0.010828957 -0.476422500 0.407985639 -0.558023621 0.604739853
## [261] 0.444059352 -0.512184602 0.498563919 0.862766244 -0.422346966
## [266] -0.322130355 -1.023376064 0.495211145 0.530676342 -0.791812550
## [271] -0.369204040 0.512261580 -0.298199957 -1.810865191 -2.713779353
## [276] 1.679380622 -1.623649292 0.324398156 -0.340367597 0.290300546
## [281] 0.942164709 1.922968794 -0.364097755 -0.442943359 -0.177965630
## [286] 0.584990807 0.825301872 -0.038455200 -0.549571334 -0.536030062
## [291] 0.022223457 -0.388824085 -0.473986505 0.386597938 0.474409778
## [296] -0.644372847 -0.721234485 -1.250211184 0.838323353 -2.279153942
## [301] -1.059089068 -0.292466074 2.111932418 0.293002413 0.343701667
## [306] -0.251184564 0.600927145 0.932984412 0.011272686 0.783363390
## [311] -0.016775709 -0.497762864 0.770052274 1.070950469 3.316777042
## [316] -1.105710165 1.188289943 0.864737910 0.418077900 -0.458498024
## [321] 0.614146548 0.684066512 -0.182920456 0.047122886 -1.653757588
## [326] 0.053214134 -0.101053079 0.367353458 -0.933587948 -4.262154637
## [331] 1.817673378 2.070859654 0.651167797 0.433085601 0.165034072
## [336] 0.621844273 -0.206000423 -0.635155878 0.356895541 1.098726115
## [341] 0.945030713 -0.738544755 0.151951795 0.507481427 0.504919057
## [346] -0.155376010 -1.011515717 -0.125766389 0.272836980 0.549421799
## [351] 0.624479600 0.434422838 0.489186406 0.215219062 0.086925398
## [356] 0.250332073 -0.825561841 0.210677766 -1.051174239 -0.844690885
## [361] -1.249085398 -0.354591162 0.669216061 0.131898280 -0.337214816
## [366] -0.602696273 0.367001755 0.233174351 1.226604631 0.867021832
## [371] -0.103562552 -0.253991292 0.140310762 0.415153088 1.054263566
## [376] 0.347754935 0.942819284 0.353410410 0.301856417 1.103475949
## [381] -0.530832961 0.942643392 0.227283957 0.808479172 0.044011932
## [386] 0.562127285 0.213872551 0.000000000 0.000000000 1.484212058
## [391] -0.105147445 0.732022391 0.000000000 0.000000000 0.584212026
## [396] 0.495820938 -0.164458228 -2.108532969 -0.240396173
##
##
## $CAC
## $CAC$x
## [1] -1.287284949 -1.707553094 -0.676462784 0.871541335 -0.629805182
## [6] 1.130785550 1.273029655 -0.312861400 0.075458866 0.372832405
## [11] -0.352703444 0.221682051 -0.119648113 0.393113587 -0.125977957
## [16] 0.406724252 0.584480259 0.176820835 -0.835371144 -0.668586325
## [21] -0.151350483 -0.202406436 0.741077227 -0.094374049 -0.346025823
## [26] -0.260489498 1.259028466 0.490104222 -0.454981140 -0.293128990
## [31] 1.522756851 0.692343235 -0.081760594 -0.075458866 -3.022583937
## [36] 1.980752397 2.375107084 1.395360129 0.826498615 1.040202966
## [41] -0.319465652 0.183205739 0.379575363 0.427283386 -0.132312852
## [46] 0.591949043 -0.800262203 0.511469191 -0.540325710 -0.339363596
## [51] -0.977501770 0.189598120 0.987682290 0.844309926 0.170443132
## [56] 0.043990118 -0.577043938 0.668586325 0.339363596 0.228125248
## [61] -0.273510070 -0.490104222 -0.157708228 0.483032470 -0.280037647
## [66] 0.434176329 -0.591949043 -0.716497500 -0.947550382 -0.554922943
## [71] 0.326083868 -0.359396830 0.056570646 -0.652956285 -0.100686285
## [76] 1.231742970 0.420410685 -0.518642559 0.533070235 -1.073140638
## [81] -0.606986835 -0.461959623 0.132312852 0.253995872 0.928046482
## [86] -0.215248044 0.700349168 -0.372832405 -0.069160134 -0.967421566
## [91] 0.359396830 -0.708400243 0.208822935 0.997966220 -0.062864145
## [96] 0.113323060 -0.757715106 1.118958381 -0.399909659 -2.137203375
## [101] -1.980752397 -0.189598120 0.215248044 -1.483865480 0.195998259
## [106] 1.412187579 -0.774565430 -0.286577179 -0.987682290 -1.655888698
## [111] 0.808945725 -0.584480259 -0.766113077 -1.142773047 -1.378918772
## [116] -1.564098295 0.606986835 1.503027005 0.977501770 0.569639391
## [121] 0.862393206 -0.434176329 -1.331704246 -1.861620217 -1.029471543
## [126] 2.672947708 -0.056570646 1.316608391 -0.195998259 1.192455456
## [131] 1.301806749 -0.050279388 -1.018857387 1.095761965 0.957438716
## [136] -0.599450994 0.475984791 2.137203375 0.266994125 -0.844309926
## [141] 1.029471543 1.564098295 -0.853316686 -0.043990118 0.366106357
## [146] 0.151350483 -1.503027005 -0.241040394 0.676462784 1.008356792
## [151] 1.073140638 -0.692343235 -0.420410685 0.138653062 -1.062033337
## [156] -0.113323060 0.562265945 -0.386335624 0.119648113 0.299693408
## [161] -0.164072354 -0.234577930 0.835371144 -0.724642010 0.908889378
## [166] 1.543097927 0.247512940 1.794824260 0.645200916 0.757715106
## [171] -1.218437810 1.347109832 0.273510070 -0.176820835 -0.306270765
## [176] 0.286577179 0.312861400 -0.880762990 -0.427283386 0.547609740
## [181] 0.880762990 -1.347109832 -1.395360129 0.346025823 -1.040202966
## [186] 0.774565430 -0.918425797 -0.441089963 -0.253995872 -1.681160057
## [191] 0.684381435 1.631632667 0.386335624 -0.700349168 1.018857387
## [196] -0.144998850 -0.871541335 0.766113077 0.107002537 1.608300307
## [201] 0.088065570 -2.375107084 1.827203533 1.483865480 -0.660751127
## [206] 0.525842714 0.918425797 -0.783073486 -0.037702589 -0.031416549
## [211] 0.164072354 0.241040394 -0.393113587 0.399909659 0.622163162
## [216] 0.629805182 1.447097300 -0.379575363 -0.025131751 0.577043938
## [221] 0.332716397 0.062864145 0.817690678 -0.018847945 0.724642010
## [226] -0.817690678 -0.622163162 -1.631632667 0.413557785 1.287284949
## [231] -1.465233793 0.461959623 -0.826498615 1.331704246 0.125977957
## [236] -0.957438716 -0.890060160 -0.012564883 0.937753841 -0.899434908
## [241] 0.448024745 -1.585812035 0.202406436 -0.808945725 -0.006282318
## [246] -1.084381938 -0.525842714 -1.316608391 -0.107002537 -0.645200916
## [251] 0.749370236 -1.231742970 -1.008356792 1.107285697 -1.522756851
## [256] 0.599450994 0.280037647 1.062033337 -0.366106357 -0.511469191
## [261] -0.468960676 -1.154927051 -0.504322046 0.614557305 -0.928046482
## [266] 0.637484161 -1.608300307 0.783073486 -0.221682051 0.000000000
## [271] 0.006282318 -0.448024745 -0.208822935 -2.027546869 -1.735192204
## [276] -0.406724252 -1.794824260 0.497200571 0.012564883 1.179761118
## [281] 0.800262203 1.764224226 -1.167254099 -1.543097927 1.655888698
## [286] 0.260489498 0.660751127 -0.332716397 -1.051055539 -1.273029655
## [291] -1.107285697 -0.533070235 -0.547609740 1.681160057 0.306270765
## [296] -1.245269831 -0.247512940 -0.684381435 1.084381938 -2.280865771
## [301] -1.362841938 0.050279388 1.154927051 -0.088065570 -0.266994125
## [306] -0.791638608 1.937931511 3.022583937 -0.170443132 0.069160134
## [311] -0.997966220 -0.497200571 0.967421566 1.465233793 2.496817918
## [316] -1.764224226 1.245269831 -0.475984791 1.585812035 -2.203366572
## [321] 0.441089963 -0.326083868 0.094374049 0.791638608 -2.496817918
## [326] -1.898394677 0.352703444 -0.862393206 -1.937931511 -2.672947708
## [331] 0.144998850 2.079254280 1.167254099 -1.118958381 -0.228125248
## [336] 1.707553094 -1.130785550 -1.179761118 0.853316686 0.890060160
## [341] 2.280865771 -0.637484161 0.554922943 1.861620217 0.319465652
## [346] 0.293128990 -1.447097300 -1.192455456 0.716497500 0.018847945
## [351] 2.027546869 0.947550382 0.100686285 -0.749370236 -0.483032470
## [356] 0.708400243 0.025131751 1.218437810 -1.429424692 -1.412187579
## [361] -0.569639391 -0.413557785 -1.827203533 0.157708228 -2.079254280
## [366] 2.203366572 0.081760594 1.051055539 0.652956285 1.205344920
## [371] 1.142773047 -0.614557305 0.899434908 -1.095761965 0.468960676
## [376] -1.301806749 1.362841938 -1.259028466 -0.732834875 -1.205344920
## [381] 0.454981140 -0.562265945 -0.183205739 1.378918772 1.429424692
## [386] 1.898394677 -0.299693408 1.735192204 0.031416549 0.234577930
## [391] 0.732834875 -0.741077227 -0.138653062 0.037702589 -0.908889378
## [396] 0.504322046 0.540325710 -0.937753841 0.518642559
##
## $CAC$y
## [1] -1.257897112 -1.856612396 -0.576251455 0.878168725 -0.510707446
## [6] 1.178323514 1.320265206 -0.193467623 0.017103763 0.313515362
## [11] -0.244345948 0.148105953 -0.034127752 0.341394026 -0.039693791
## [16] 0.346040390 0.520097236 0.112479613 -0.696590079 -0.554392714
## [21] -0.073951874 -0.108163498 0.689576566 -0.016979851 -0.243419190
## [26] -0.153217569 1.301506110 0.398339318 -0.335289187 -0.173815531
## [31] 1.595147158 0.635780628 0.000000000 0.000000000 -7.295500742
## [36] 2.257777778 3.900092721 1.461319650 0.791600242 1.085355877
## [41] -0.199633107 0.113531924 0.318608921 0.355278032 -0.042911549
## [46] 0.520525892 -0.661968823 0.424548581 -0.444159041 -0.241883466
## [51] -0.829786088 0.114099430 1.009443178 0.805931657 0.111928366
## [56] 0.005323963 -0.479131175 0.588424093 0.271218890 0.153805357
## [61] -0.169455624 -0.387226819 -0.074551361 0.389022116 -0.169869413
## [66] 0.361586728 -0.482144749 -0.596283874 -0.814096728 -0.448188347
## [71] 0.244087655 -0.254315243 0.010849517 -0.531568670 -0.021812630
## [76] 1.281771572 0.350045775 -0.423956209 0.468876314 -0.976290098
## [81] -0.498374865 -0.348432056 0.065559441 0.174710636 0.942882058
## [86] -0.124183359 0.648718780 -0.273928456 0.000000000 -0.818656757
## [91] 0.293239207 -0.584763658 0.136158161 1.033394974 0.000000000
## [96] 0.043066322 -0.624192854 1.175005415 -0.283649987 -2.946543581
## [101] -2.388984129 -0.090646422 0.141763538 -1.398640997 0.114856716
## [106] 1.474215568 -0.627473149 -0.170658172 -0.866146219 -1.776168305
## [111] 0.784176030 -0.481941702 -0.624307136 -1.027477689 -1.322892567
## [116] -1.593122520 0.531492455 1.579970831 0.999042833 0.503465024
## [121] 0.878123527 -0.321317988 -1.289415074 -2.119700748 -0.903851987
## [126] 4.009549461 0.000000000 1.377199694 -0.104499274 1.232056721
## [131] 1.366324129 0.000000000 -0.894829246 1.165780902 0.977235497
## [136] -0.497874245 0.387923765 2.704973118 0.190850101 -0.712963971
## [141] 1.068903141 1.616227357 -0.715200683 0.000000000 0.306418665
## [146] 0.107186880 -1.424059104 -0.141204584 0.609126013 1.048705335
## [151] 1.139463970 -0.581825875 -0.303256012 0.074710497 -0.975843865
## [156] -0.026925148 0.500942634 -0.278700825 0.053746103 0.214868930
## [161] -0.080403087 -0.134112977 0.800386764 -0.607513989 0.911479277
## [166] 1.599277403 0.167346512 1.869061293 0.574005740 0.718507949
## [171] -1.163673160 1.423086767 0.191793267 -0.085638003 -0.186548351
## [176] 0.207102086 0.216755721 -0.759519139 -0.309173847 0.477909401
## [181] 0.890553054 -1.298961834 -1.331300813 0.272942631 -0.919315906
## [186] 0.746423388 -0.792344104 -0.321543408 -0.150884495 -1.797717680
## [191] 0.620821394 1.687496704 0.337084479 -0.584039694 1.055367819
## [196] -0.072023871 -0.736202636 0.720916965 0.036045314 1.652339527
## [201] 0.030382824 -3.437278526 1.892529489 1.569252933 -0.547084747
## [206] 0.458411858 0.932920955 -0.627919827 0.000000000 0.000000000
## [211] 0.111212213 0.166633003 -0.282300751 0.343764218 0.544108016
## [216] 0.546174275 1.500049836 -0.274954583 0.000000000 0.516961253
## [221] 0.259600313 0.014656310 0.791324736 0.000000000 0.683338180
## [226] -0.669073406 -0.503973638 -1.763101500 0.347050074 1.363636364
## [231] -1.389159680 0.380604024 -0.689383494 1.403213011 0.058676837
## [236] -0.816107120 -0.763697280 0.000000000 0.953279380 -0.781980032
## [241] 0.371765639 -1.595140501 0.115427080 -0.666700085 0.000000000
## [246] -0.979006863 -0.433187239 -1.274504786 -0.025922854 -0.528963336
## [251] 0.693394505 -1.170135653 -0.890611903 1.168199598 -1.468206280
## [256] 0.530278927 0.200443085 1.089703095 -0.260375983 -0.417689135
## [261] -0.351281917 -1.036514785 -0.393428678 0.539097945 -0.801656403
## [266] 0.567299973 -1.665691022 0.757657755 -0.128907509 0.000000000
## [271] 0.000000000 -0.328062816 -0.113311390 -2.668539326 -1.881451881
## [276] -0.294134284 -1.996936518 0.410998553 0.000000000 1.199123717
## [281] 0.774752193 1.842849067 -1.054618117 -1.565129586 1.692597025
## [286] 0.179331988 0.581785634 -0.233592881 -0.919834987 -1.243459180
## [291] -0.991340018 -0.443088963 -0.445060979 1.805620065 0.216709438
## [296] -1.172252888 -0.143951172 -0.576634760 1.159958242 -3.124641670
## [301] -1.302006273 0.005996282 1.187192709 -0.005925575 -0.154074074
## [306] -0.635052525 2.221956755 4.037630011 -0.084245998 0.016863406
## [311] -0.876749283 -0.391222997 0.990437158 1.566903393 3.973362930
## [316] -1.996157131 1.285263043 -0.381761480 1.630053436 -3.016623294
## [321] 0.366902141 -0.212789175 0.032806605 0.765236403 -3.970707893
## [326] -2.169123877 0.277152261 -0.731271953 -2.360788863 -4.295134557
## [331] 0.093109870 2.586046512 1.190907992 -0.991696039 -0.132746033
## [336] 1.812579300 -1.020710937 -1.073205828 0.860606061 0.895325081
## [341] 3.108808290 -0.525616589 0.493554756 2.062749177 0.243432971
## [346] 0.214604394 -1.386306002 -1.085776330 0.664394246 0.000000000
## [351] 2.553948577 0.962560860 0.033257580 -0.615060675 -0.384701160
## [356] 0.660435440 0.000000000 1.278843481 -1.356025254 -1.341273375
## [361] -0.468212331 -0.294717751 -2.097544338 0.110317599 -2.865096857
## [366] 2.734654884 0.023247704 1.086577571 0.574811749 1.240212608
## [371] 1.179857740 -0.502148078 0.902820613 -0.983661220 0.387270584
## [376] -1.269149055 1.432697208 -1.211478339 -0.610341904 -1.159947689
## [381] 0.373928551 -0.464236589 -0.086370703 1.434993084 1.488551787
## [386] 2.132900409 -0.180881386 1.839547526 0.000000000 0.156367950
## [391] 0.689098250 -0.614874619 -0.053798149 0.000000000 -0.791258478
## [396] 0.417774402 0.475470067 -0.812002581 0.439143399
##
##
## $FTSE
## $FTSE$x
## [1] 0.800262203 -0.577043938 1.130785550 0.668586325 -0.977501770
## [6] 1.040202966 0.977501770 0.125977957 -0.652956285 1.655888698
## [11] 1.218437810 0.247512940 -0.692343235 -0.247512940 0.774565430
## [16] 1.412187579 -0.312861400 -0.031416549 0.434176329 0.326083868
## [21] 0.075458866 -0.253995872 0.164072354 0.454981140 -0.800262203
## [26] -0.547609740 1.192455456 0.183205739 -1.631632667 -0.056570646
## [31] 0.700349168 1.154927051 0.399909659 0.215248044 -2.672947708
## [36] 0.637484161 2.027546869 0.947550382 0.783073486 -0.012564883
## [41] -1.084381938 0.253995872 0.622163162 0.372832405 1.585812035
## [46] -0.441089963 -0.176820835 -0.062864145 0.234577930 -0.684381435
## [51] -1.167254099 -0.157708228 0.676462784 -0.757715106 -1.029471543
## [56] -0.518642559 -0.468960676 0.280037647 0.511469191 -1.107285697
## [61] -0.119648113 0.967421566 -0.107002537 0.202406436 1.084381938
## [66] 1.142773047 -0.075458866 -0.957438716 -0.037702589 -1.483865480
## [71] 0.189598120 -0.749370236 -0.645200916 -0.791638608 0.899434908
## [76] 0.132312852 0.138653062 0.441089963 0.547609740 -1.378918772
## [81] -0.808945725 0.107002537 -1.764224226 -0.708400243 1.861620217
## [86] -0.202406436 1.179761118 -0.490104222 -0.853316686 -1.205344920
## [91] 0.599450994 -0.260489498 0.228125248 0.997966220 -0.164072354
## [96] 0.937753841 -1.585812035 0.692343235 -0.732834875 -2.137203375
## [101] -2.079254280 0.448024745 -0.386335624 -0.947550382 0.468960676
## [106] 0.716497500 -1.347109832 -1.062033337 -0.366106357 -0.234577930
## [111] 0.319465652 0.221682051 -0.928046482 -1.040202966 1.095761965
## [116] -1.008356792 -0.584480259 1.980752397 1.465233793 -0.511469191
## [121] -0.346025823 -1.095761965 -1.273029655 -1.937931511 -0.716497500
## [126] 1.764224226 -0.006282318 0.000000000 1.681160057 0.100686285
## [131] 2.375107084 0.006282318 -0.018847945 0.518642559 -0.497200571
## [136] -0.461959623 -0.835371144 1.543097927 -1.118958381 0.569639391
## [141] 1.301806749 0.987682290 0.273510070 -0.195998259 0.406724252
## [146] -0.081760594 -1.154927051 0.195998259 -0.741077227 1.483865480
## [151] 0.540325710 -0.228125248 0.260489498 0.928046482 -0.483032470
## [156] -0.138653062 -0.406724252 -0.614557305 -0.880762990 1.018857387
## [161] -0.069160134 -0.676462784 -0.050279388 -0.359396830 1.331704246
## [166] 0.684381435 -1.018857387 0.352703444 -0.043990118 0.817690678
## [171] -0.622163162 0.844309926 -0.125977957 0.069160134 -0.326083868
## [176] 0.490104222 -0.293128990 -1.073140638 -0.208822935 0.826498615
## [181] 1.205344920 -2.203366572 -1.655888698 -0.937753841 -0.221682051
## [186] 1.008356792 -1.465233793 0.170443132 -0.525842714 -0.817690678
## [191] 0.880762990 0.346025823 0.379575363 -1.395360129 0.286577179
## [196] -0.660751127 -1.794824260 -0.144998850 -1.316608391 0.908889378
## [201] 0.208822935 -0.540325710 1.937931511 3.022583937 0.853316686
## [206] 0.420410685 1.707553094 -0.088065570 0.012564883 0.018847945
## [211] -0.562265945 -0.899434908 0.113323060 1.564098295 0.660751127
## [216] -0.286577179 0.606986835 -0.454981140 0.306270765 0.025131751
## [221] 0.144998850 1.631632667 0.176820835 1.118958381 0.504322046
## [226] -0.724642010 -0.100686285 -1.331704246 -0.533070235 0.918425797
## [231] -0.113323060 0.475984791 -0.379575363 0.554922943 0.031416549
## [236] -0.413557785 -0.241040394 -0.170443132 0.591949043 -0.393113587
## [241] 0.386335624 -1.287284949 0.088065570 -0.599450994 -1.192455456
## [246] -0.434176329 0.081760594 -1.142773047 -0.448024745 -0.427283386
## [251] 1.107285697 -0.890060160 -1.898394677 1.073140638 -1.861620217
## [256] 0.461959623 -1.543097927 1.245269831 -1.259028466 -0.987682290
## [261] 0.299693408 -1.503027005 -0.967421566 1.029471543 -1.564098295
## [266] 1.259028466 -1.179761118 1.287284949 -0.306270765 -0.606986835
## [271] 0.332716397 0.157708228 -0.132312852 -2.280865771 -1.707553094
## [276] 0.584480259 -1.608300307 0.562265945 -1.301806749 -1.735192204
## [281] 1.347109832 2.079254280 -0.554922943 -0.591949043 1.051055539
## [286] -0.668586325 -0.766113077 -0.826498615 -1.681160057 -1.429424692
## [291] -0.918425797 -0.299693408 0.749370236 1.794824260 0.957438716
## [296] -1.245269831 0.427283386 -0.183205739 0.359396830 -2.496817918
## [301] -1.827203533 0.266994125 1.447097300 0.094374049 0.037702589
## [306] -0.774565430 0.732834875 2.280865771 -1.130785550 0.483032470
## [311] -1.980752397 -0.504322046 0.652956285 1.608300307 2.203366572
## [316] -2.375107084 0.413557785 2.672947708 2.496817918 -0.273510070
## [321] 1.273029655 -0.215248044 1.735192204 -1.051055539 -2.027546869
## [326] 0.312861400 -0.569639391 0.890060160 -1.218437810 -3.022583937
## [331] 1.827203533 1.429424692 1.062033337 0.151350483 0.724642010
## [336] 1.316608391 -0.420410685 -1.522756851 0.808945725 -0.094374049
## [341] 2.137203375 1.362841938 0.525842714 0.497200571 -0.319465652
## [346] 0.393113587 -0.997966220 -0.332716397 0.708400243 1.395360129
## [351] 0.757715106 -0.637484161 0.871541335 -0.339363596 -0.280037647
## [356] 0.835371144 -0.862393206 1.378918772 -1.447097300 -0.871541335
## [361] -0.025131751 1.167254099 0.119648113 1.231742970 -0.372832405
## [366] 0.241040394 -0.844309926 1.503027005 0.766113077 0.791638608
## [371] 0.533070235 -1.412187579 0.339363596 -0.475984791 -0.189598120
## [376] 0.645200916 -0.908889378 -1.231742970 -0.399909659 0.293128990
## [381] -0.151350483 0.629805182 0.366106357 1.898394677 0.741077227
## [386] 1.522756851 -0.629805182 0.062864145 0.043990118 0.050279388
## [391] 0.862393206 -0.700349168 0.577043938 0.056570646 0.614557305
## [396] -1.362841938 -0.266994125 -0.352703444 -0.783073486
##
## $FTSE$y
## [1] 0.679325585 -0.487765222 0.906788661 0.578853627 -0.720408902
## [6] 0.855359170 0.823988102 0.083718705 -0.521808405 1.405461680
## [11] 0.959526160 0.164267835 -0.534947286 -0.227692066 0.668896321
## [16] 1.149110807 -0.285946134 -0.034876962 0.376027291 0.220136716
## [21] 0.023121387 -0.261981815 0.112021014 0.385847127 -0.626513434
## [26] -0.468012687 0.936540629 0.123200123 -1.153579943 -0.046681709
## [31] 0.603253678 0.924600565 0.321987121 0.145193336 -3.071346814
## [36] 0.551072623 1.855549031 0.810945847 0.674799848 0.000000000
## [41] -0.791456811 0.167951752 0.533495923 0.284284740 1.281324413
## [46] -0.395581430 -0.164855751 -0.048787811 0.153944355 -0.532353603
## [51] -0.844263531 -0.159647256 0.582502094 -0.609409894 -0.754055907
## [56] -0.445126631 -0.416281221 0.197398978 0.448101364 -0.799907703
## [61] -0.112424889 0.822789723 -0.084687043 0.130990908 0.873412851
## [66] 0.911622230 -0.052918053 -0.703426367 -0.038086533 -1.082069649
## [71] 0.127108851 -0.592421620 -0.514685964 -0.614594679 0.763209393
## [76] 0.085453486 0.089261458 0.376114773 0.479004906 -0.976509938
## [81] -0.628955235 0.062512209 -1.280699699 -0.537910849 1.741758460
## [86] -0.203244088 0.932127051 -0.430716697 -0.643024162 -0.851147284
## [91] 0.518237202 -0.263686095 0.149948702 0.827423168 -0.160218835
## [96] 0.806293788 -1.125994952 0.592970744 -0.585571518 -1.716013508
## [101] -1.590155420 0.385692826 -0.368033649 -0.698193627 0.404692801
## [106] 0.622913444 -0.971070200 -0.772216547 -0.345878284 -0.218990166
## [111] 0.219470786 0.148748037 -0.693126496 -0.760282509 0.874952903
## [116] -0.730411687 -0.493311037 1.810772204 1.167828994 -0.440528634
## [121] -0.323664372 -0.793291956 -0.911501492 -1.400735909 -0.538569187
## [126] 1.662829368 0.000000000 0.000000000 1.438517027 0.053747881
## [131] 3.020661157 0.000000000 -0.012033212 0.453305520 -0.435286131
## [136] -0.413123696 -0.636352652 1.248429330 -0.800672565 0.492352395
## [141] 1.052166580 0.826610500 0.177367861 -0.192791942 0.323254622
## [146] -0.058941412 -0.841393410 0.130848533 -0.590028907 1.175111536
## [151] 0.476396709 -0.215517241 0.168859219 0.799749098 -0.427815806
## [156] -0.132802125 -0.379380476 -0.502532292 -0.674742532 0.842205625
## [161] -0.051213363 -0.528162075 -0.043586797 -0.344882264 1.078006285
## [166] 0.586383314 -0.751203099 0.264122679 -0.043249194 0.684419620
## [171] -0.503965308 0.714622271 -0.116959064 0.003903201 -0.304437766
## [176] 0.434561328 -0.272861932 -0.785647280 -0.204861521 0.694800837
## [181] 0.944838672 -2.035109523 -1.153663178 -0.693859544 -0.214054927
## [186] 0.829724370 -1.063744380 0.117661379 -0.445777273 -0.635024017
## [191] 0.725112659 0.252165779 0.296158059 -0.982930184 0.204256710
## [196] -0.521831302 -1.290930700 -0.132857261 -0.943709986 0.763839342
## [201] 0.137448457 -0.457532651 1.805114491 5.590215071 0.715229729
## [206] 0.366653802 1.526629494 -0.060601470 0.000000000 0.000000000
## [211] -0.485105738 -0.685505370 0.076692998 1.272128132 0.575104048
## [216] -0.270859980 0.524330441 -0.405268490 0.214762066 0.000000000
## [221] 0.090232348 1.371046503 0.118575610 0.880861616 0.443922662
## [226] -0.562495434 -0.069791361 -0.948355082 -0.449029577 0.782822635
## [231] -0.110963160 0.418425535 -0.365057709 0.481125093 0.000000000
## [236] -0.383057090 -0.221844265 -0.163047506 0.497364709 -0.369330773
## [241] 0.307680902 -0.923907018 0.037300906 -0.499645774 -0.850665168
## [246] -0.393075818 0.026561433 -0.834566215 -0.397842470 -0.387909513
## [251] 0.875231339 -0.684172304 -1.373922414 0.862371717 -1.334726091
## [256] 0.403874054 -1.093493712 0.975282319 -0.907206820 -0.722149876
## [261] 0.214643453 -1.082817706 -0.713741529 0.848107912 -1.125305354
## [266] 1.000405022 -0.846132253 1.023214430 -0.284238761 -0.501846796
## [271] 0.229996368 0.096618357 -0.120656371 -2.073769832 -1.159587154
## [276] 0.495070100 -1.146713032 0.485782487 -0.929360283 -1.228335857
## [281] 1.081771721 2.098255667 -0.478705844 -0.497594958 0.858476413
## [286] -0.524750021 -0.610591900 -0.635239050 -1.156628533 -1.038253691
## [291] -0.692264695 -0.281434015 0.646954105 1.673856773 0.818906993
## [296] -0.900635495 0.373720644 -0.173471546 0.267017038 -2.307984952
## [301] -1.302410108 0.175361683 1.164113786 0.043260080 0.000000000
## [306] -0.614027502 0.635224504 2.978815391 -0.827070826 0.423334180
## [311] -1.454346177 -0.436326304 0.562835661 1.294539862 2.159517483
## [316] -2.151025969 0.350210970 4.440146323 3.345545312 -0.268796260
## [321] 1.011679231 -0.212683681 1.577213718 -0.770639402 -1.576316801
## [326] 0.214843750 -0.487234457 0.755973365 -0.878591144 -4.055379064
## [331] 1.720966357 1.153351551 0.862103214 0.094532850 0.629623800
## [336] 1.075394963 -0.386892096 -1.091389288 0.679337155 -0.066305238
## [341] 2.138786980 1.096675583 0.468685036 0.436401941 -0.303404877
## [346] 0.308085362 -0.726646191 -0.305614247 0.605533058 1.109731783
## [351] 0.662251656 -0.513749261 0.720734109 -0.309837335 -0.270100270
## [356] 0.712324701 -0.655713549 1.097597152 -1.060005869 -0.663577386
## [361] -0.014927601 0.925649448 0.081360947 0.968147218 -0.347679696
## [366] 0.154247310 -0.641707308 1.188367287 0.667444744 0.677511684
## [371] 0.475025191 -0.999283668 0.249629174 -0.418621436 -0.177574835
## [376] 0.555454710 -0.689580475 -0.879776057 -0.377773703 0.206170385
## [381] -0.143287530 0.548217374 0.274443794 1.802722330 0.645230670
## [386] 1.221640489 -0.513722730 0.003536818 0.000000000 0.000000000
## [391] 0.717948718 -0.537256830 0.494263019 0.000000000 0.526962937
## [396] -0.975013105 -0.268210051 -0.336164190 -0.614237529
# --- Scatterplots, log returns, covariance and correlation ---
# NOTE(review): eu_stocks and eu_percentreturns are defined earlier in the
# document (eu_stocks is presumably the EuStockMarkets data — DAX, SMI, CAC,
# FTSE daily closes — confirm against the earlier chunk).
# Add the theoretical reference line to the Q-Q plot drawn above.
qqline(eu_percentreturns)
# Reset the graphics device to a single plotting panel.
par(mfrow=c(1, 1))
# Make a scatterplot of DAX and FTSE
plot(eu_stocks[,"DAX"], eu_stocks[,"FTSE"])
# Make a scatterplot matrix of eu_stocks
pairs(eu_stocks)
# Convert eu_stocks to log returns: diff(log(P)) = log(P_t) - log(P_{t-1})
logreturns <- diff(log(eu_stocks))
# Plot logreturns
plot(logreturns)
# Make a scatterplot matrix of logreturns
pairs(logreturns)
# Extract the single-series log returns for pairwise statistics below.
DAX_logreturns <- logreturns[,"DAX"]
FTSE_logreturns <- logreturns[,"FTSE"]
# Use cov() with DAX_logreturns and FTSE_logreturns
cov(DAX_logreturns, FTSE_logreturns)
## [1] 5.092401e-05
# Use cov() with logreturns: on a multivariate series cov() returns the
# full pairwise covariance matrix (the [DAX, FTSE] entry matches the
# scalar call above).
cov(logreturns)
##               DAX          SMI          CAC         FTSE
## DAX  9.883355e-05 6.840581e-05 8.373055e-05 5.092401e-05
## SMI  6.840581e-05 7.927600e-05 7.327089e-05 4.880343e-05
## CAC  8.373055e-05 7.327089e-05 1.357431e-04 6.848845e-05
## FTSE 5.092401e-05 4.880343e-05 6.848845e-05 8.353753e-05
# Use cor() with DAX_logreturns and FTSE_logreturns
cor(DAX_logreturns, FTSE_logreturns)
## [1] 0.5604406
# Use cor() with logreturns: the scale-free analogue of cov(); the
# diagonal is 1 by construction.
cor(logreturns)
##           DAX       SMI       CAC      FTSE
## DAX  1.0000000 0.7728049 0.7228911 0.5604406
## SMI  0.7728049 1.0000000 0.7063203 0.5997064
## CAC  0.7228911 0.7063203 1.0000000 0.6431579
## FTSE 0.5604406 0.5997064 0.6431579 1.0000000
# --- Autocorrelation computed by hand versus acf() ---
# Build a simple annual time series from a raw numeric vector.
xData <- c( 2.07, 1.3, 0.03, -0.34, 0.23, 0.47, 4.34, 2.82, 2.91, 2.33, 1.16, 0.82, -0.24, -0.03, -1.54, -0.69, -1.42, -0.77, 0.84, 0.04, 1.07, 1.5, -0.21, 0.33, -0.75, -0.11, 0.2, -0.17, 0.87, 1.47, 0.84, 0.96, 0.67, -0.26, 0.08, -1.46, -1.27, -2.19, -2.21, 0.42, -1.02, -1.54, -0.73, 0.7, -0.36, -0.77, -0.5, 1.31, 1.16, 0.69, -0.79, 0.33, 2.01, 1.71, 1, 0.69, 0.66, 1.51, 0.86, 1.97, 2.98, 3.02, 1.3, 0.71, 0.41, -0.53, -0.21, 1.73, -0.76, -1.34, -1.72, -2.78, -1.73, -3.49, -2.42, -0.14, -0.16, -0.28, -0.97, -1.53, -1.04, -1.26, -1.44, -1.24, -0.45, 1.13, 3.26, 1.14, 0.99, 0.38, 2.71, 2.42, 1.79, -1.03, -1.07, -2.63, -2.67, -1.3, -1.04, 0.4, -0.49, -0.49, -1.08, -0.27, -1.84, -2.1, -1.89, -1.85, -0.34, -1.21, -0.5, -0.58, -1.67, -1.41, -2.55, -0.87, -2.17, -2.6, -2.06, -0.88, 1.33, 1.08, -0.96, -1.81, -2.06, -2.34, -0.01, 0.77, 0.03, 1.17, 2.68, 4.58, 4.91, 4.13, 4.04, 1.35, 0.61, 1.43, 0.79, 1.34, 2.22, 2.83, 2.43, 1.89, 0.47, -1.31, -1.46, 0.21, 1.1, 1.42 )
x <- ts(data=xData, start=c(1, 1), frequency=1)
n <- length(x)
# Define x_t0 as x[-1]: the series with its first observation dropped (x[t])
x_t0 <- x[-1]
# Define x_t1 as x[-n]: the series with its last observation dropped (x[t-1])
x_t1 <- x[-n]
# Confirm that x_t0 and x_t1 are (x[t], x[t-1]) pairs
head(cbind(x_t0, x_t1))
##       x_t0  x_t1
## [1,]  1.30  2.07
## [2,]  0.03  1.30
## [3,] -0.34  0.03
## [4,]  0.23 -0.34
## [5,]  0.47  0.23
## [6,]  4.34  0.47
# Plot x_t0 and x_t1: the lag-1 scatterplot
plot(x_t0, x_t1)
# View the correlation between x_t0 and x_t1: the "by hand" lag-1 estimate
cor(x_t0, x_t1)
## [1] 0.7630798
# Use acf with x: acf() uses a slightly different denominator than cor(),
# so the lag-1 value below differs from the by-hand estimate above.
acf(x, lag.max = 1, plot = FALSE)
## 
## Autocorrelations of series 'x', by lag
## 
##     0     1 
## 1.000 0.758
# Confirm that the difference factor is (n-1)/n: rescaling the by-hand
# correlation reproduces the acf() estimate.
cor(x_t1, x_t0) * (n-1)/n
## [1] 0.7579926
# Generate ACF estimates for x up to lag-10
acf(x, lag.max = 10, plot = FALSE)
## 
## Autocorrelations of series 'x', by lag
## 
##     0     1     2     3     4     5     6     7     8     9    10 
## 1.000 0.758 0.537 0.345 0.226 0.198 0.140 0.114 0.124 0.118 0.100
# Type the ACF estimate at lag-10
0.1 # may differ slightly due to rounding
## [1] 0.1
# Type the ACF estimate at lag-5
0.198 # may differ slightly due to rounding
## [1] 0.198
# --- Comparing ACF patterns across three different series ---
# Three raw vectors of equal length: x (slowly wandering), y (shows a
# repeating pattern in the plot below), z (noise-like) — the acf() calls
# at the end make the persistence differences visible.
xData <- c( -0.037, -0.677, -0.735, -1.531, -2.27, -1.966, -0.964, -0.525, -0.894, -0.589, 1.174, 0.237, 0.495, 0.451, -0.075, 0.394, 1.694, 0.129, -0.378, 0.683, 1.725, 1.441, 0.601, 0.057, 0.066, -1.115, -0.638, -2.109, -1.634, -0.974, -3.366, -3.009, -4.468, -4.133, -5.638, -5.004, -3.228, -2.902, -2.652, -2.295, -3.406, -2.196, -0.02, 0.008, -1.067, -0.586, 0.362, -0.791, -0.724, -0.238, -0.006, -0.887, -1.354, -2.613, -1.704, -0.967, 0.407, 1.216, 2.585, 4.095, 1.323, 2.301, 1.051, 1.035, 0.328, -0.254, 0.115, -0.096, -1.291, -2.435, -0.34, -0.161, -0.194, 0.013, 0.67, 0.258, 0.408, 0.635, 0.787, 0.211, 0.571, 1.452, 1.149, 3.41, 0.329, 0.494, -0.782, -1.251, -2.175, -1.332, -0.258, 0.696, 1.803, 1.134, 0.341, 1.206, 2.518, 1.459, -0.077, -1.048, 0.459, -0.119, 0.019, 0.481, 0.53, 3.184, 2.545, 3.264, 1.889, 1.813, 0.152, -0.589, 0.69, -0.72, -0.858, -1.287, -1.528, -1.207, -2.333, -2.767, -3.079, -1.889, -1.805, -1.725, -2.02, -1.885, -1.857, -0.569, 0.45, -0.685, 0.144, -0.459, -0.716, 0.009, -0.269, 0.408, 1.515, 1.918, 2.316, 0.864, 0.868, -0.244, -1.638, -2.346, -0.934, -0.703, -1.651, -1.456, -0.166, -0.33 )
yData <- c( -1.363, -2.007, 1.459, 5.736, -0.604, -1.295, 1.261, 5.438, -1.159, -2.092, 1.03, 5.792, -0.529, 0.499, 0.937, 4.712, 2.557, 1.319, 2.033, 4.465, 1.995, 1.54, -0.411, 4.891, 0.482, 2.582, -0.763, 5.177, 0.569, 3.998, 0.479, 3.462, -0.742, 3.582, -1.834, 3.307, 0.894, 4.393, -0.535, 3.215, 0.605, 4.754, 0.364, 2.099, 2.121, 4.177, 1.053, 2.481, 3.878, 4.343, 2.663, 1.744, 6.083, 4.762, 1.744, 2.017, 6.513, 5.345, 0.633, 3.043, 5.872, 4.106, 0.143, 2.816, 5.296, 3.718, 1.703, 2.252, 4.088, 3.576, 1.084, 0.592, 2.83, 3.034, 1.845, 0.255, 3.195, 1.867, 0.608, 2.624, 3.104, 2.17, -0.087, 3.059, 3.751, 1.832, 0.933, 4.723, 2.821, 1.332, 0.24, 4.433, 3.374, 0.928, 2.101, 4.943, 3.517, 1.842, 0.582, 4.262, 2.347, 0.123, 0.035, 5.626, 4.225, 0.695, 0.846, 6.523, 2.926, 0.766, 0.242, 5.072, 2.156, 0.569, -1.052, 4.85, 1.204, 2.729, 0.828, 1.481, -1.803, 2.223, 0.816, 1.572, -1.601, 0.099, 1.694, 1.615, -2.158, 0.272, 1.636, 1.477, -2.183, 0.722, 1.851, 0.814, -1.248, 0.496, 2.982, 1.452, -1.673, 0.229, 2.828, 2.407, -0.046, 1.626, 5.61, 2.945, -0.771, 0.444 )
zData <- c( 0.316, 1.735, -0.009, 0.814, -0.929, -1.153, 0.863, 0.531, -1.166, -1.813, 1.612, 0.027, -0.441, 0.522, 0.67, 0.661, -0.603, 0.311, -0.495, -1.107, 0.571, -1.002, 0.257, 0.329, -1.939, -0.857, -1.363, -0.572, 0.805, -0.496, 0.174, -0.504, 0.131, 0.421, -0.229, -0.578, -0.469, 0.364, -0.866, 0.423, 0.464, -0.792, -0.764, -0.55, 0.566, 0.145, 0.483, 0.475, -0.17, 1.205, 0.776, -0.033, 0.118, 0.234, 0.127, 0.95, 0.448, -0.959, 1.425, 0.502, -2.396, 0.047, -0.168, 0.663, 0.181, 0.22, -1.99, 1.079, -0.868, 0.686, 0.482, -2.113, 1.368, 1.464, 0.072, 0.302, -1.101, 0.116, -0.043, 0.137, 0.362, -0.192, -0.305, 3.129, -0.378, 0.717, -0.711, 0.181, 0.689, 0.816, -0.799, 0.044, 0.54, -0.622, 0.545, -0.365, -0.759, -1.492, -1.17, -1.567, -1.613, 1.255, -0.322, 1.431, -0.316, 0.166, 0.194, -0.799, -1.252, -2.43, 0.18, -0.308, 0.504, -0.442, -0.364, -2.189, 0.526, -0.485, 0.211, -0.097, -0.966, 0.016, -0.06, -0.155, 0.101, 0.062, -0.735, -0.318, 1.038, 1.085, 0.691, 0.86, 0.432, 1.346, 1.928, 0.015, 0.971, 0.305, -0.772, -1.538, -1.304, -0.64, 1.134, 0.03, 0.739, 1.925, 0.988, 1.01, -0.214, 1.478 )
# Wrap each vector as an annual ts object (start = 1, frequency = 1).
x <- ts(data=xData, start=c(1, 1), frequency=1)
y <- ts(data=yData, start=c(1, 1), frequency=1)
z <- ts(data=zData, start=c(1, 1), frequency=1)
# Stacked time-series plot of all three series.
plot(cbind(x, y, z))
# View the ACF of x
acf(x)
# View the ACF of y
acf(y)
# View the ACF of z
acf(z)
Chapter 4 - Autoregression
Autoregressive (AR) Model - where the current observation depends strongly on one or more previous observations:
AR Model Estimation and Forecasting - example from Mishkin data in package Ecdat:
Example code includes:
# Simulate an AR model with 0.5 slope
x <- arima.sim(model = list(ar=0.5), n = 100)
# Simulate an AR model with 0.9 slope
y <- arima.sim(model = list(ar=0.9), n = 100)
# Simulate an AR model with -0.75 slope
z <- arima.sim(model = list(ar=-0.75), n = 100)
# Plot your simulated data
plot.ts(cbind(x, y, z))
# Calculate the ACF for x
acf(x)
# Calculate the ACF for y
acf(y)
# Calculate the ACF for z
acf(z)
# Simulate and plot AR model with slope 0.9
x <- arima.sim(model = list(ar=0.9), n = 200)
ts.plot(x)
acf(x)
# Simulate and plot AR model with slope 0.98
y <- arima.sim(model = list(ar=0.98), n = 200)
ts.plot(y)
acf(y)
# Simulate and plot RW model
z <- arima.sim(model = list(order=c(0, 1, 0)), n = 200)
ts.plot(z)
acf(z)
# Hard-coded series (n = 100) used in the AR estimation exercise.
xData <- c( 0.829, 0.458, 0.053, 0.063, -0.736, -0.568, -0.056, -0.148, -0.461, -0.757, -1.571, -0.231, -1.261, -0.738, -0.75, -1.921, -2.473, -3.552, -1.912, -4.195, -2.818, -3.139, -1.296, -0.796, 0.83, -0.21, -0.313, 0.059, 1.527, 3.761, 3.255, 2.586, 1.214, 1.49, 2.389, 3.566, 3.843, 4.94, 4.685, 3.247, 2.398, 2.107, 1.644, -0.185, -1.972, -0.343, -2.117, -2.693, -2.261, -2.456, -2.08, -2.385, -1.553, -2.665, -3.956, -2.091, -1.692, -1.303, -2.698, -2.093, -2.658, -2.572, -1.599, -1.713, -1.587, -1.103, -1.194, -1.333, -0.3, -0.218, 1.675, 1.199, 1.165, 1.657, -0.531, -0.923, -0.912, -0.691, -0.517, -0.811, 1.785, 3.082, 1.498, 1.814, 2.774, 2.592, 2.433, 0.699, -0.315, -1.049, 1.062, 1.694, 2.755, 1.546, 0.908, 2.491, 1.926, -0.296, -0.731, -1.395 )
x <- ts(data=xData, start=c(1, 1), frequency=1)
str(x)
## Time-Series [1:100] from 1 to 100: 0.829 0.458 0.053 0.063 -0.736 -0.568 -0.056 -0.148 -0.461 -0.757 ...
# Fit the AR model to x
# order = c(1, 0, 0): AR(1) — one autoregressive term, no differencing, no MA term
arima(x, order = c(1, 0, 0))
##
## Call:
## arima(x = x, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.8575 -0.0948
## s.e. 0.0491 0.6703
##
## sigma^2 estimated as 1.022: log likelihood = -143.66, aic = 293.32
# Values below are transcribed from the arima() coefficient table above.
# Copy and paste the slope (ar1) estimate
0.8575 #
## [1] 0.8575
# Copy and paste the slope mean (intercept) estimate
-0.0948 #
## [1] -0.0948
# Copy and paste the innovation variance (sigma^2) estimate
1.022 #
## [1] 1.022
## [1] 1.022
# Fit an AR(1) to the monthly AirPassengers series and print the model.
data(AirPassengers, package="datasets")
# Fit the AR model to AirPassengers
AR <- arima(AirPassengers, order = c(1, 0, 0))
print(AR)
##
## Call:
## arima(x = AirPassengers, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.9646 278.4649
## s.e. 0.0214 67.1141
##
## sigma^2 estimated as 1119: log likelihood = -711.09, aic = 1428.18
# Run the following commands to plot the series and fitted values
ts.plot(AirPassengers)
# Fitted values = observed series minus model residuals
AR_fitted <- AirPassengers - residuals(AR)
points(AR_fitted, type = "l", col = 2, lty = 2)
# Fit an AR(1) to the annual Nile flow series; used for forecasting below.
data(Nile, package="datasets")
# Fit an AR model to Nile
AR_fit <-arima(Nile, order = c(1, 0, 0))
print(AR_fit)
##
## Call:
## arima(x = Nile, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.5063 919.5685
## s.e. 0.0867 29.1410
##
## sigma^2 estimated as 21125: log likelihood = -639.95, aic = 1285.9
# Use predict() to make a 1-step forecast
predict_AR <- predict(AR_fit)
# Obtain the 1-step forecast using $pred[1]
predict(AR_fit)$pred[1]
## [1] 828.6576
# Use predict to make 1-step through 10-step forecasts
# (forecasts revert toward the series mean as the horizon grows)
predict(AR_fit, n.ahead = 10)
## $pred
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 828.6576 873.5426 896.2668 907.7715 913.5960 916.5448 918.0377
## [8] 918.7935 919.1762 919.3699
##
## $se
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 145.3439 162.9092 167.1145 168.1754 168.4463 168.5156 168.5334
## [8] 168.5380 168.5391 168.5394
# Run to plot the Nile series plus the forecast and 95% prediction intervals
ts.plot(Nile, xlim = c(1871, 1980))
AR_forecast <- predict(AR_fit, n.ahead = 10)$pred
AR_forecast_se <- predict(AR_fit, n.ahead = 10)$se
points(AR_forecast, type = "l", col = 2)
# Approximate 95% band: forecast +/- 2 standard errors
points(AR_forecast - 2*AR_forecast_se, type = "l", col = 2, lty = 2)
points(AR_forecast + 2*AR_forecast_se, type = "l", col = 2, lty = 2)
Chapter 5 - Simple Moving Average
Simple Moving Average Model - weighted average of current and previous noise:
MA Model Estimation and Forecasting - inflation data available in Ecdat::Mishkin:
Compute the AR and MA models - differences and implications for usage:
Example code includes:
# Simulate MA(1) processes with different slopes; an MA(1) ACF cuts off
# after lag 1, unlike the AR(1) ACF which decays gradually.
# Generate MA model with slope 0.5
x <- arima.sim(model = list(ma=0.5), n = 100)
# Generate MA model with slope 0.9
y <- arima.sim(model = list(ma=0.9), n = 100)
# Generate MA model with slope -0.5
z <- arima.sim(model = list(ma=-0.5), n = 100)
# Plot all three models together
plot.ts(cbind(x, y, z))
# Calculate ACF for x
acf(x)
# Calculate ACF for y
acf(y)
# Calculate ACF for z
acf(z)
# Hard-coded series (n = 100) used in the MA estimation exercise.
xData <- c( -0.291, 0.378, -0.413, 0.791, 2.626, 1.955, 1.321, -0.563, -1.005, -1.945, -1.3, -0.968, -1.621, -0.247, -0.911, -0.036, 0.203, 0.323, 1.032, -0.066, 1.104, 3.577, 1.925, 0.255, 0.092, 0.832, 0.578, -1.189, -0.927, -0.288, 0.092, -0.248, -1.739, 0.599, 1.404, 1.942, 2.002, 2.473, 2.005, -0.547, -0.085, 0.055, 1.08, 0.091, 0.038, 1.062, -0.571, -0.149, -0.297, -2.916, -0.892, 0.064, -1.894, -0.821, 0.296, 1.245, 2.076, 0.82, -0.445, -0.619, -0.308, -0.779, -0.619, 0.541, 0.313, -0.416, -0.637, -1.198, 0.382, 0.011, -0.55, 0.272, -1.323, -1.865, -1.996, 0.091, -1.318, -1.269, 0.259, 0.987, 1.746, 1.88, 0.435, -0.986, 0.229, 1.781, 3.713, 2.018, -0.461, -1.422, -0.604, 1.405, 2.359, 1.908, 2.052, 1.572, -0.755, -1.396, -0.522, -0.298 )
x <- ts(data=xData, start=c(1, 1), frequency=1)
str(x)
## Time-Series [1:100] from 1 to 100: -0.291 0.378 -0.413 0.791 2.626 ...
# Fit the MA model to x
# order = c(0, 0, 1): MA(1) — no AR term, no differencing, one MA term
arima(x, order = c(0, 0, 1))
##
## Call:
## arima(x = x, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.7927 0.1590
## s.e. 0.0902 0.1747
##
## sigma^2 estimated as 0.9576: log likelihood = -140.22, aic = 286.45
# Values below are transcribed from the arima() coefficient table above
# (ma1 = 0.7927, intercept = 0.1590, sigma^2 = 0.9576).
# Paste the slope (ma1) estimate below
0.7927 #
## [1] 0.7927
# Paste the slope mean (intercept) estimate below
0.1590 #
## [1] 0.159
# Paste the innovation variance (sigma^2) estimate below
0.9576 #
## [1] 0.9576
# Fit an MA(1) to the Nile series; compared against the AR(1) fit below.
# Fit the MA model to Nile
MA <- arima(Nile, order = c(0, 0, 1))
print(MA)
##
## Call:
## arima(x = Nile, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.3783 919.2433
## s.e. 0.0791 20.9685
##
## sigma^2 estimated as 23272: log likelihood = -644.72, aic = 1295.44
# Plot Nile and MA_fit
ts.plot(Nile)
# Fitted values = observed series minus model residuals
MA_fit <- Nile - resid(MA)
points(MA_fit, type = "l", col = 2, lty = 2)
# Make a 1-step forecast based on MA
predict_MA <- predict(MA)
# Obtain the 1-step forecast using $pred[1]
predict_MA$pred[1]
## [1] 868.8747
# Make a 1-step through 10-step forecast based on MA
# (an MA(1) forecast collapses to the mean after one step)
predict(MA, n.ahead=10)
## $pred
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 868.8747 919.2433 919.2433 919.2433 919.2433 919.2433 919.2433
## [8] 919.2433 919.2433 919.2433
##
## $se
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 152.5508 163.1006 163.1006 163.1006 163.1006 163.1006 163.1006
## [8] 163.1006 163.1006 163.1006
# Plot the Nile series plus the forecast and 95% prediction intervals
ts.plot(Nile, xlim = c(1871, 1980))
MA_forecasts <- predict(MA, n.ahead = 10)$pred
MA_forecast_se <- predict(MA, n.ahead = 10)$se
points(MA_forecasts, type = "l", col = 2)
# Approximate 95% band: forecast +/- 2 standard errors
points(MA_forecasts - 2*MA_forecast_se, type = "l", col = 2, lty = 2)
points(MA_forecasts + 2*MA_forecast_se, type = "l", col = 2, lty = 2)
# These should actually be from fitting MA and AR to the Nile data
# (hard-coded fitted values standing in for AR_fit/MA_fit computed above)
ARFitData <- c( 947.15, 1021.04, 1041.29, 941.56, 1066.61, 1041.29, 1041.29, 865.62, 1076.73, 1147.61, 1031.17, 957.76, 927.38, 1015.98, 957.25, 970.41, 940.04, 1051.42, 858.53, 939.03, 1031.17, 1010.92, 1066.61, 1036.23, 1086.86, 1091.92, 1071.67, 975.48, 1010.92, 845.87, 879.29, 896.5, 805.37, 929.91, 875.74, 808.91, 917.76, 804.36, 970.41, 985.6, 944.59, 874.73, 821.57, 684.88, 871.18, 809.42, 1021.04, 1010.92, 875.23, 840.81, 869.67, 842.83, 881.82, 891.44, 890.42, 807.39, 881.82, 830.68, 857.01, 980.54, 838.28, 849.41, 891.94, 881.82, 931.94, 952.19, 908.14, 870.17, 965.35, 844.35, 796.26, 782.59, 882.32, 865.11, 829.67, 859.54, 980.54, 889.41, 896.5, 883.34, 904.6, 830.68, 833.21, 878.27, 985.6, 918.77, 953.2, 857.52, 921.31, 947.63, 866.63, 970.41, 912.7, 910.17, 1046.36, 915.74, 831.7, 919.28, 817.52, 815.49 )
MAFitData <- c( 932.23, 987.22, 984, 911.36, 1032.19, 967.59, 992.03, 851.52, 1062.41, 1035.6, 958.74, 932.96, 920.01, 991.11, 920.34, 956.94, 920.4, 1017.44, 836.61, 965.16, 985.38, 962.6, 1012.83, 971.13, 1024.73, 1008.24, 999.35, 930.84, 983.23, 840.1, 919.21, 902.14, 840.51, 956.88, 872.38, 854.41, 942.54, 824.47, 993.21, 940.73, 929.94, 881.82, 860.3, 766.31, 941.07, 828.81, 1029.39, 945.95, 876.14, 876.82, 898.13, 870.02, 909.78, 901.93, 904.14, 841.27, 920.66, 852.42, 897.9, 973, 838.29, 897.57, 906.92, 895.82, 937.47, 936.84, 904.17, 888.16, 965.33, 845.73, 855.04, 841.3, 921.02, 878, 867.8, 893.98, 974.48, 875.94, 918.51, 892.57, 918.27, 853.32, 879.78, 903.44, 974.68, 897.8, 952.61, 860.38, 942.93, 931.37, 875.22, 974.01, 893.52, 922.07, 1013.03, 881.03, 868.17, 938.47, 835.84, 873.15 )
AR_fit <- ts(data=ARFitData, start=c(1871, 1), frequency=1)
MA_fit <- ts(data=MAFitData, start=c(1871, 1), frequency=1)
# Find correlation between AR_fit and MA_fit
cor(AR_fit, MA_fit)
## [1] 0.9401758
# Need to create AR and MA, though the MA model is probably already OK from exercises above
# Compare model fit: lower AIC/BIC is better; here the MA(1) wins on both.
# Find AIC of AR
AIC(AR)
## [1] 1428.179
# Find AIC of MA
AIC(MA)
## [1] 1295.442
# Find BIC of AR
BIC(AR)
## [1] 1437.089
# Find BIC of MA
BIC(MA)
## [1] 1303.257
Chapter 1 - Time Series Data and Models
A time series is a sequence of data in chronological order (recorded sequentially over time), especially common in finance and economics:
Stationarity and Non-Stationarity - definitions, and conversions from non-stationarity to stationarity:
Stationary Time Series - ARMA:
Example code includes:
# Load and plot three example series: monthly air passengers, daily DJIA
# closings, and the Southern Oscillation Index.
data(AirPassengers, package="datasets")
data(djia, package="astsa")
data(soi, package="astsa")
# View a detailed description of AirPassengers
help(AirPassengers)
## starting httpd help server ...
## done
# Plot AirPassengers
plot(AirPassengers)
# Plot the DJIA daily closings
plot(djia[,"Close"])
# Plot the Southern Oscillation Index
plot(soi)
# Hard-coded example series: y is trend stationary, x is a random walk;
# differencing removes the trend in both cases.
yData <- c( 1.0752, -1.2233, -0.8066, 2.2028, -0.1881, 0.909, -1.197, -0.6968, 1.1385, -3.7502, 3.2141, -3.4124, -0.5707, 2.4628, 0.8797, 2.647, 3.3487, 2.1274, 1.4951, -1.0343, -0.2178, 2.5329, -0.3333, -1.1314, 3.4232, -2.6573, 2.3444, 5.107, 2.7611, 0.2877, -1.4333, 2.9236, 0.1324, 4.2033, 0.1539, -0.4517, 5.2934, 0.9239, 6.3714, 6.8761, 2.6617, 4.1279, 6.1697, 2.6619, 2.3581, 8.5626, 3.6387, 3.0449, 1.5867, 5.2176, 5.6889, 2.4215, 3.6722, 3.6326, 4.4526, 5.3535, 6.808, 5.5121, 6.7058, 3.7262, 9.6174, 7.8367, 5.1775, 5.8864, 4.2734, 12.0168, 5.0889, 6.2802, 4.2652, 4.162, 5.9201, 8.9842, 13.745, 9.4167, 8.9174, 7.543, 6.2326, 9.2702, 8.9234, 9.2996, 6.5795, 9.4189, 8.9092, 10.9316, 9.9733, 7.8103, 10.2368, 10.29, 8.6811, 10.3147, 6.7295, 12.7876, 5.988, 9.3356, 10.5408, 10.1422, 10.2608, 9.0473, 11.5869, 13.5886, 9.4664, 7.4157, 11.0767, 14.2901, 11.2511, 11.6835, 11.5153, 9.0543, 11.5185, 11.4878, 9.0081, 11.8876, 10.8354, 8.4025, 11.3758, 10.3381, 10.4919, 14.8334, 11.638, 12.1553, 14.1939, 13.2541, 9.6846, 12.8065, 14.3461, 12.9815, 11.5454, 12.7671, 12.6851, 11.4467, 12.9778, 12.6478, 15.6949, 12.0763, 12.1423, 13.4401, 15.3413, 14.4367, 13.863, 13.1309, 10.9893, 12.3688, 13.5126, 14.678, 15.2781, 15.5538, 14.0693, 14.6665, 15.6628, 14.0735, 15.6187, 14.4782, 15.2514, 13.011, 11.4298, 20.1918, 19.0593, 16.7098, 15.6343, 11.2168, 18.6198, 15.2306, 17.6491, 16.8749, 17.8477, 15.4435, 19.3254, 19.3206, 15.1768, 17.6434, 13.9196, 20.696, 21.2888, 16.4249, 20.2915, 17.4472, 15.4037, 18.6493, 17.7711, 18.5901, 18.5847, 18.4996, 20.1874, 21.1373, 18.3648, 19.7737, 20.3995, 19.5494, 19.2275, 18.8669, 20.7898, 22.0548, 20.5807, 19.3122, 16.1878, 16.5707, 18.108, 22.0924, 22.4979, 19.8109, 21.9049, 24.0603, 20.8068, 23.1255, 20.6354, 23.8614, 17.866, 20.3238, 17.4633, 19.1253, 19.322, 22.6845, 21.8192, 18.6206, 24.9521, 21.9321, 18.4697, 19.5132, 22.2926, 21.4382, 25.9301, 17.8538, 20.7046, 22.3747, 21.0983, 25.7179, 19.8315, 27.5421, 20.7885, 17.8304, 23.0441, 
21.0823, 21.6648, 24.2464, 25.5073, 23.7694, 25.6801, 22.9365, 26.6749, 26.6338, 24.3009, 25.5076, 26.2825, 23.9235, 25.9379, 26.9582, 24.2888, 24.6939, 28.6157, 26.6019 )
xData <- c( 2.9859, -6.3616, -0.1457, 4.9285, 3.2626, 3.6556, 4.519, 9.9376, 11.754, 2.3091, 4.4596, -3.359, 3.1244, 4.3235, 3.3884, -1.369, -5.1293, 0.5116, 6.1125, 15.3293, 9.6873, 9.862, 15.9674, 16.3417, 20.5944, 20.2246, 22.4165, 23.8751, 19.2596, 12.6268, 3.4223, 7.8371, 13.6312, 17.4746, 15.231, 17.7947, 12.092, 10.4566, 7.8127, 14.7825, 11.1885, 23.8849, 30.7432, 33.85, 33.4494, 27.2179, 23.1117, 27.1605, 20.3911, 21.1012, 19.1438, 20.0941, 16.1906, 13.7102, 14.6144, 14.9335, 29.1133, 31.3782, 32.7828, 30.4111, 28.2442, 29.0585, 35.9782, 34.9491, 38.223, 31.3179, 29.1704, 22.3349, 16.5423, 23.9608, 20.8017, 19.3039, 19.1387, 13.0404, 9.8801, 3.2505, -4.1992, -7.9626, -4.5083, -6.2854, -2.453, -4.7119, 1.6309, 1.1959, 5.2831, 5.15, 3.72, 0.6658, 2.7384, 8.747, 8.2221, 18.663, 11.3843, 10.3179, 21.0908, 25.0415, 24.7982, 34.6863, 26.3264, 23.3543, 23.7712, 22.7445, 29.2034, 30.2059, 36.2288, 37.6518, 36.3735, 39.842, 27.8231, 26.5969, 26.9149, 24.3732, 28.5127, 26.7399, 30.4023, 39.5915, 44.8034, 44.099, 40.2248, 42.9846, 40.8308, 42.4046, 41.4261, 40.459, 27.9815, 40.4637, 44.3681, 47.9082, 49.0735, 48.4331, 49.8923, 61.6028, 63.6814, 72.3463, 71.1518, 74.7257, 79.1934, 83.1976, 74.4918, 72.1001, 66.1204, 63.7527, 63.148, 67.4173, 74.2575, 68.8726, 68.1953, 70.0591, 71.8744, 73.2482, 79.2107, 78.5204, 87.2619, 87.7628, 91.3676, 93.3275, 97.5043, 103.3569, 94.6093, 91.3573, 85.871, 86.2847, 86.2251, 84.2668, 86.9466, 92.0229, 82.0012, 88.6786, 85.3663, 88.9641, 96.0459, 96.2658, 90.9596, 88.4945, 95.4932, 92.919, 88.7586, 91.0783, 92.4792, 93.5653, 94.3455, 87.9873, 88.7311, 102.6294, 96.466, 92.2194, 91.9247, 84.9855, 90.2585, 82.241, 89.7112, 86.6858, 85.9218, 95.0793, 95.0479, 101.2393, 99.3097, 94.1683, 96.0313, 91.7769, 91.129, 95.5681, 101.2689, 100.3594, 103.8543, 97.5836, 98.9271, 103.799, 105.883, 102.1103, 105.8276, 107.9296, 101.8401, 107.2261, 106.4817, 111.6719, 116.1099, 115.1661, 115.6657, 115.8189, 120.278, 118.6835, 109.1592, 109.7436, 
117.1348, 114.0379, 116.9896, 113.5988, 111.9652, 114.1912, 108.2102, 105.3345, 108.2169, 112.0761, 102.6672, 112.187, 113.2779, 112.4105, 103.1019, 98.7301, 103.9845, 97.909, 104.8979, 108.135, 103.5588, 102.4043, 102.0028, 100.3617, 97.9829, 89.8509 )
y <- ts(data=yData, frequency=1, start=c(1, 1)) # trend stationary
x <- ts(data=xData, frequency=1, start=c(1, 1)) # random walk
plot(cbind(y, x))
# Plot detrended y (trend stationary)
plot(diff(y))
# Plot detrended x (random walk)
plot(diff(x))
# Difference two astsa series to compare each with its detrended version.
data(globtemp, package="astsa")
data(cmort, package="astsa")
# Plot globtemp and detrended globtemp
par(mfrow = c(2,1))
plot(globtemp)
plot(diff(globtemp))
# Plot cmort and detrended cmort
par(mfrow = c(2,1))
plot(cmort)
plot(diff(cmort))
# Restore the single-panel plotting layout
par(mfrow=c(1, 1))
# diff(log(x)) approximates the percentage growth rate / return of x.
data(gnp, package="astsa")
# Plot GNP series (gnp) and its growth rate
par(mfrow = c(2,1))
plot(gnp)
plot(diff(log(gnp)))
# Plot DJIA closings (djia$Close) and its returns
par(mfrow = c(2,1))
plot(djia[,"Close"])
plot(diff(log(djia[,"Close"])))
# Restore the single-panel plotting layout
par(mfrow=c(1, 1))
# Generate and plot white noise
WN <- arima.sim(model=list(order=c(0, 0, 0)), n=200)
plot(WN)
# Generate and plot an MA(1) with parameter .9
MA <- arima.sim(model=list(order=c(0, 0, 1), ma=0.9), n=200)
plot(MA)
# Generate and plot an AR(2) with parameters 1.5 and -.75
# (note: this overwrites the MA/AR model objects fitted earlier)
AR <- arima.sim(model=list(order=c(2, 0, 0), ar=c(1.5, -.75)), n=200)
plot(AR)
Chapter 2 - Fitting ARMA Models
AR and MA Models have many visual similarities - cannot necessarily distinguish visually:
AR and MA together make an ARMA model - typical for time series, since they are frequently correlated:
Model Choice and Residual Analysis - frequently a good idea to fit several models and then select the best:
Example code includes:
# Generate 100 observations from the AR(1) model
x <- arima.sim(model = list(order = c(1, 0, 0), ar = .9), n = 100)
# Plot the generated data
plot(x)
# Plot the sample P/ACF pair
# acf2() shows ACF and PACF together; for AR(1) expect ACF tailing off
# and PACF cutting off after lag 1 (see table below)
astsa::acf2(x)
## ACF PACF
## [1,] 0.77 0.77
## [2,] 0.65 0.12
## [3,] 0.53 0.00
## [4,] 0.40 -0.10
## [5,] 0.32 0.02
## [6,] 0.27 0.06
## [7,] 0.12 -0.24
## [8,] 0.03 -0.06
## [9,] 0.02 0.15
## [10,] -0.02 0.00
## [11,] -0.08 -0.16
## [12,] -0.10 -0.04
## [13,] -0.13 0.09
## [14,] -0.16 -0.06
## [15,] -0.15 -0.06
## [16,] -0.23 -0.22
## [17,] -0.26 0.07
## [18,] -0.28 -0.05
## [19,] -0.32 -0.15
## [20,] -0.33 -0.05
# Fit an AR(1) to the data and examine the t-table
# sarima() prints the optimizer trace, coefficient t-table, and residual
# diagnostics (output recorded below)
astsa::sarima(x, p=1, d=0, q=0)
## initial value 0.330439
## iter 2 value -0.131167
## iter 3 value -0.131173
## iter 4 value -0.131176
## iter 5 value -0.131176
## iter 6 value -0.131178
## iter 7 value -0.131178
## iter 8 value -0.131178
## iter 9 value -0.131178
## iter 10 value -0.131178
## iter 11 value -0.131178
## iter 11 value -0.131178
## final value -0.131178
## converged
## initial value -0.128453
## iter 2 value -0.128520
## iter 3 value -0.128683
## iter 4 value -0.128691
## iter 5 value -0.128696
## iter 6 value -0.128696
## iter 6 value -0.128696
## final value -0.128696
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 xmean
## 0.7723 -0.0631
## s.e. 0.0621 0.3725
##
## sigma^2 estimated as 0.7661: log likelihood = -129.02, aic = 264.05
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.7723 0.0621 12.4451 0.0000
## xmean -0.0631 0.3725 -0.1694 0.8658
##
## $AIC
## [1] 0.7735322
##
## $AICc
## [1] 0.7960322
##
## $BIC
## [1] -0.1743644
# Simulate an AR(2); for AR(p) the PACF cuts off after lag p.
x <- arima.sim(model = list(order = c(2, 0, 0), ar = c(1.5, -.75)), n = 200)
# Plot x
plot(x)
# Plot the sample P/ACF of x
astsa::acf2(x)
## ACF PACF
## [1,] 0.86 0.86
## [2,] 0.55 -0.74
## [3,] 0.18 -0.03
## [4,] -0.15 -0.03
## [5,] -0.35 0.14
## [6,] -0.41 -0.12
## [7,] -0.36 0.00
## [8,] -0.25 -0.07
## [9,] -0.12 0.02
## [10,] 0.00 0.08
## [11,] 0.09 0.00
## [12,] 0.14 -0.05
## [13,] 0.15 -0.01
## [14,] 0.12 -0.09
## [15,] 0.05 0.00
## [16,] -0.04 -0.08
## [17,] -0.13 -0.02
## [18,] -0.21 -0.14
## [19,] -0.24 0.05
## [20,] -0.24 -0.08
## [21,] -0.20 -0.09
## [22,] -0.15 -0.03
## [23,] -0.10 -0.13
## [24,] -0.04 0.15
## [25,] 0.04 0.14
# Fit an AR(2) to the data and examine the t-table
# (estimates should recover the simulation parameters 1.5 and -0.75)
astsa::sarima(x, p=2, d=0, q=0)
## initial value 1.135284
## iter 2 value 1.000955
## iter 3 value 0.573050
## iter 4 value 0.319593
## iter 5 value 0.112702
## iter 6 value 0.013660
## iter 7 value 0.002697
## iter 8 value -0.000025
## iter 9 value -0.000091
## iter 10 value -0.000199
## iter 11 value -0.000317
## iter 12 value -0.000318
## iter 13 value -0.000318
## iter 14 value -0.000318
## iter 14 value -0.000318
## iter 14 value -0.000318
## final value -0.000318
## converged
## initial value 0.006655
## iter 2 value 0.006642
## iter 3 value 0.006631
## iter 4 value 0.006630
## iter 5 value 0.006630
## iter 6 value 0.006630
## iter 7 value 0.006630
## iter 7 value 0.006630
## iter 7 value 0.006630
## final value 0.006630
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 xmean
## 1.5319 -0.7745 -0.1679
## s.e. 0.0449 0.0451 0.2912
##
## sigma^2 estimated as 0.9973: log likelihood = -285.11, aic = 578.23
##
## $degrees_of_freedom
## [1] 197
##
## $ttable
## Estimate SE t.value p.value
## ar1 1.5319 0.0449 34.1410 0.0000
## ar2 -0.7745 0.0451 -17.1766 0.0000
## xmean -0.1679 0.2912 -0.5766 0.5649
##
## $AIC
## [1] 1.027264
##
## $AICc
## [1] 1.038289
##
## $BIC
## [1] 0.07673852
# Simulate an MA(1); for MA(q) the ACF cuts off after lag q.
x <- arima.sim(model = list(order = c(0, 0, 1), ma = -.8), n = 100)
# Plot x
plot(x)
# Plot the sample P/ACF of x
astsa::acf2(x)
## ACF PACF
## [1,] -0.53 -0.53
## [2,] -0.02 -0.42
## [3,] 0.22 -0.03
## [4,] -0.20 -0.10
## [5,] 0.01 -0.15
## [6,] 0.04 -0.17
## [7,] -0.08 -0.22
## [8,] 0.08 -0.13
## [9,] -0.03 -0.12
## [10,] 0.01 -0.08
## [11,] 0.14 0.16
## [12,] -0.22 -0.05
## [13,] 0.17 0.06
## [14,] -0.09 -0.06
## [15,] -0.01 0.02
## [16,] -0.05 -0.18
## [17,] 0.04 -0.15
## [18,] -0.01 -0.17
## [19,] 0.14 0.13
## [20,] -0.23 -0.19
# Fit an MA(1) to the data and examine the t-table
# (estimate should be close to the simulation parameter -0.8)
astsa::sarima(x, p=0, d=0, q=1)
## initial value 0.180845
## iter 2 value -0.040665
## iter 3 value -0.075337
## iter 4 value -0.084417
## iter 5 value -0.103906
## iter 6 value -0.108263
## iter 7 value -0.111058
## iter 8 value -0.111196
## iter 9 value -0.111839
## iter 10 value -0.112343
## iter 11 value -0.112403
## iter 12 value -0.112403
## iter 13 value -0.112412
## iter 14 value -0.112412
## iter 14 value -0.112412
## iter 14 value -0.112412
## final value -0.112412
## converged
## initial value -0.105937
## iter 2 value -0.106122
## iter 3 value -0.106222
## iter 4 value -0.106227
## iter 5 value -0.106228
## iter 5 value -0.106228
## iter 5 value -0.106228
## final value -0.106228
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.9236 -0.0341
## s.e. 0.0917 0.0080
##
## sigma^2 estimated as 0.7932: log likelihood = -131.27, aic = 268.54
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.9236 0.0917 -10.0697 0
## xmean -0.0341 0.0080 -4.2614 0
##
## $AIC
## [1] 0.808372
##
## $AICc
## [1] 0.830872
##
## $BIC
## [1] -0.1395246
# Simulate an ARMA(2,1); both ACF and PACF tail off for ARMA models.
x <- arima.sim(model = list(order = c(2, 0, 1), ar = c(1, -.9), ma = .8), n = 250)
# Plot x
plot(x)
# Plot the sample P/ACF of x
astsa::acf2(x)
## ACF PACF
## [1,] 0.54 0.54
## [2,] -0.38 -0.94
## [3,] -0.86 0.38
## [4,] -0.51 -0.27
## [5,] 0.27 0.21
## [6,] 0.73 -0.23
## [7,] 0.47 0.12
## [8,] -0.20 -0.12
## [9,] -0.63 0.05
## [10,] -0.44 0.00
## [11,] 0.15 -0.05
## [12,] 0.54 0.05
## [13,] 0.41 0.05
## [14,] -0.08 -0.02
## [15,] -0.45 0.09
## [16,] -0.37 -0.08
## [17,] 0.03 -0.05
## [18,] 0.34 -0.02
## [19,] 0.30 0.10
## [20,] 0.01 -0.03
## [21,] -0.24 -0.04
## [22,] -0.25 -0.08
## [23,] -0.06 -0.03
## [24,] 0.14 -0.05
## [25,] 0.18 0.00
## [26,] 0.07 -0.11
# Fit an ARMA(2,1) to the data and examine the t-table
# (estimates should recover ar = 1, -0.9 and ma = 0.8)
astsa::sarima(x, p=2, d=0, q=1)
## initial value 1.450426
## iter 2 value 0.635360
## iter 3 value 0.398344
## iter 4 value 0.219155
## iter 5 value 0.077040
## iter 6 value 0.064600
## iter 7 value 0.059515
## iter 8 value 0.055589
## iter 9 value 0.055489
## iter 10 value 0.055488
## iter 11 value 0.055488
## iter 12 value 0.055488
## iter 13 value 0.055488
## iter 13 value 0.055488
## iter 13 value 0.055488
## final value 0.055488
## converged
## initial value 0.058943
## iter 2 value 0.058942
## iter 3 value 0.058633
## iter 4 value 0.058610
## iter 5 value 0.058563
## iter 6 value 0.058532
## iter 7 value 0.058521
## iter 8 value 0.058520
## iter 9 value 0.058520
## iter 9 value 0.058520
## iter 9 value 0.058520
## final value 0.058520
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 ma1 xmean
## 0.9637 -0.8879 0.7885 -0.1368
## s.e. 0.0285 0.0278 0.0424 0.1284
##
## sigma^2 estimated as 1.097: log likelihood = -369.36, aic = 748.73
##
## $degrees_of_freedom
## [1] 246
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9637 0.0285 33.8291 0.0000
## ar2 -0.8879 0.0278 -31.9071 0.0000
## ma1 0.7885 0.0424 18.5994 0.0000
## xmean -0.1368 0.1284 -1.0656 0.2877
##
## $AIC
## [1] 1.124816
##
## $AICc
## [1] 1.133799
##
## $BIC
## [1] 0.1811589
# Load the glacial varve series (astsa) and work with the differenced
# log series, which stabilizes the variance and removes the trend.
data(varve, package="astsa")
dl_varve <- diff(log(varve))
# Fit an MA(1) to dl_varve.
astsa::sarima(dl_varve, p=0, d=0, q=1)
## initial value -0.551780
## iter 2 value -0.671633
## iter 3 value -0.706234
## iter 4 value -0.707586
## iter 5 value -0.718543
## iter 6 value -0.719692
## iter 7 value -0.721967
## iter 8 value -0.722970
## iter 9 value -0.723231
## iter 10 value -0.723247
## iter 11 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## final value -0.723248
## converged
## initial value -0.722762
## iter 2 value -0.722764
## iter 3 value -0.722764
## iter 4 value -0.722765
## iter 4 value -0.722765
## iter 4 value -0.722765
## final value -0.722765
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.7710 -0.0013
## s.e. 0.0341 0.0044
##
## sigma^2 estimated as 0.2353: log likelihood = -440.68, aic = 887.36
##
## $degrees_of_freedom
## [1] 631
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.7710 0.0341 -22.6002 0.0000
## xmean -0.0013 0.0044 -0.2818 0.7782
##
## $AIC
## [1] -0.4406366
##
## $AICc
## [1] -0.4374168
##
## $BIC
## [1] -1.426575
# Fit an MA(2) to dl_varve. Improvement?
# (compare AIC/BIC with the MA(1) fit above; lower is better)
astsa::sarima(dl_varve, p=0, d=0, q=2)
## initial value -0.551780
## iter 2 value -0.679736
## iter 3 value -0.728605
## iter 4 value -0.734640
## iter 5 value -0.735449
## iter 6 value -0.735979
## iter 7 value -0.736015
## iter 8 value -0.736059
## iter 9 value -0.736060
## iter 10 value -0.736060
## iter 11 value -0.736061
## iter 12 value -0.736061
## iter 12 value -0.736061
## iter 12 value -0.736061
## final value -0.736061
## converged
## initial value -0.735372
## iter 2 value -0.735378
## iter 3 value -0.735379
## iter 4 value -0.735379
## iter 4 value -0.735379
## iter 4 value -0.735379
## final value -0.735379
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 ma2 xmean
## -0.6710 -0.1595 -0.0013
## s.e. 0.0375 0.0392 0.0033
##
## sigma^2 estimated as 0.2294: log likelihood = -432.69, aic = 873.39
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.6710 0.0375 -17.9057 0.0000
## ma2 -0.1595 0.0392 -4.0667 0.0001
## xmean -0.0013 0.0033 -0.4007 0.6888
##
## $AIC
## [1] -0.4629629
##
## $AICc
## [1] -0.4597027
##
## $BIC
## [1] -1.441871
# Fit an ARMA(1,1) to dl_varve. Improvement?
# (of the three candidates this has the lowest AIC/BIC per output below)
astsa::sarima(dl_varve, p=1, d=0, q=1)
## initial value -0.550994
## iter 2 value -0.648962
## iter 3 value -0.676965
## iter 4 value -0.699167
## iter 5 value -0.724554
## iter 6 value -0.726719
## iter 7 value -0.729066
## iter 8 value -0.731976
## iter 9 value -0.734235
## iter 10 value -0.735969
## iter 11 value -0.736410
## iter 12 value -0.737045
## iter 13 value -0.737600
## iter 14 value -0.737641
## iter 15 value -0.737643
## iter 16 value -0.737643
## iter 17 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## final value -0.737643
## converged
## initial value -0.737522
## iter 2 value -0.737527
## iter 3 value -0.737528
## iter 4 value -0.737529
## iter 5 value -0.737530
## iter 5 value -0.737530
## iter 5 value -0.737530
## final value -0.737530
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## 0.2341 -0.8871 -0.0013
## s.e. 0.0518 0.0292 0.0028
##
## sigma^2 estimated as 0.2284: log likelihood = -431.33, aic = 870.66
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.2341 0.0518 4.5184 0.0000
## ma1 -0.8871 0.0292 -30.4107 0.0000
## xmean -0.0013 0.0028 -0.4618 0.6444
##
## $AIC
## [1] -0.467376
##
## $AICc
## [1] -0.4641159
##
## $BIC
## [1] -1.446284
# Fit an MA(1) to dl_varve. Examine the residuals
# (sarima() plots residual diagnostics: standardized residuals, ACF,
# Q-Q plot, and Ljung-Box p-values)
astsa::sarima(dl_varve, p=0, d=0, q=1)
## initial value -0.551780
## iter 2 value -0.671633
## iter 3 value -0.706234
## iter 4 value -0.707586
## iter 5 value -0.718543
## iter 6 value -0.719692
## iter 7 value -0.721967
## iter 8 value -0.722970
## iter 9 value -0.723231
## iter 10 value -0.723247
## iter 11 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## final value -0.723248
## converged
## initial value -0.722762
## iter 2 value -0.722764
## iter 3 value -0.722764
## iter 4 value -0.722765
## iter 4 value -0.722765
## iter 4 value -0.722765
## final value -0.722765
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.7710 -0.0013
## s.e. 0.0341 0.0044
##
## sigma^2 estimated as 0.2353: log likelihood = -440.68, aic = 887.36
##
## $degrees_of_freedom
## [1] 631
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.7710 0.0341 -22.6002 0.0000
## xmean -0.0013 0.0044 -0.2818 0.7782
##
## $AIC
## [1] -0.4406366
##
## $AICc
## [1] -0.4374168
##
## $BIC
## [1] -1.426575
# Fit an ARMA(1,1) to dl_varve. Examine the residuals
# (compare the residual diagnostics with the MA(1) fit above)
astsa::sarima(dl_varve, p=1, d=0, q=1)
## initial value -0.550994
## iter 2 value -0.648962
## iter 3 value -0.676965
## iter 4 value -0.699167
## iter 5 value -0.724554
## iter 6 value -0.726719
## iter 7 value -0.729066
## iter 8 value -0.731976
## iter 9 value -0.734235
## iter 10 value -0.735969
## iter 11 value -0.736410
## iter 12 value -0.737045
## iter 13 value -0.737600
## iter 14 value -0.737641
## iter 15 value -0.737643
## iter 16 value -0.737643
## iter 17 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## final value -0.737643
## converged
## initial value -0.737522
## iter 2 value -0.737527
## iter 3 value -0.737528
## iter 4 value -0.737529
## iter 5 value -0.737530
## iter 5 value -0.737530
## iter 5 value -0.737530
## final value -0.737530
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## 0.2341 -0.8871 -0.0013
## s.e. 0.0518 0.0292 0.0028
##
## sigma^2 estimated as 0.2284: log likelihood = -431.33, aic = 870.66
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.2341 0.0518 4.5184 0.0000
## ma1 -0.8871 0.0292 -30.4107 0.0000
## xmean -0.0013 0.0028 -0.4618 0.6444
##
## $AIC
## [1] -0.467376
##
## $AICc
## [1] -0.4641159
##
## $BIC
## [1] -1.446284
data(oil, package="astsa")
# Calculate approximate oil returns
# diff(log(x)) approximates the percentage return series
oil_returns <- diff(log(oil))
# Plot oil_returns. Notice the outliers.
plot(oil_returns)
# Plot the P/ACF pair for oil_returns
astsa::acf2(oil_returns)
## ACF PACF
## [1,] 0.13 0.13
## [2,] -0.07 -0.09
## [3,] 0.13 0.16
## [4,] -0.01 -0.06
## [5,] 0.02 0.05
## [6,] -0.03 -0.08
## [7,] -0.03 0.00
## [8,] 0.13 0.12
## [9,] 0.08 0.05
## [10,] 0.02 0.03
## [11,] 0.01 -0.02
## [12,] 0.00 0.00
## [13,] -0.02 -0.03
## [14,] 0.06 0.09
## [15,] -0.05 -0.07
## [16,] -0.09 -0.06
## [17,] 0.03 0.01
## [18,] 0.05 0.04
## [19,] -0.05 -0.05
## [20,] -0.07 -0.05
## [21,] 0.04 0.05
## [22,] 0.09 0.06
## [23,] -0.05 -0.06
## [24,] -0.08 -0.05
## [25,] -0.07 -0.08
## [26,] 0.00 0.02
## [27,] -0.11 -0.11
## [28,] -0.07 0.01
## [29,] 0.02 0.00
## [30,] -0.02 -0.01
## [31,] -0.03 -0.05
## [32,] -0.05 -0.04
## [33,] -0.03 0.02
## [34,] 0.00 0.02
# Assuming both P/ACF are tailing, fit a model to oil_returns
# (tailing ACF and PACF suggest an ARMA(1,1))
astsa::sarima(oil_returns, p=1, d=0, q=1)
## initial value -3.057594
## iter 2 value -3.061420
## iter 3 value -3.067360
## iter 4 value -3.067479
## iter 5 value -3.071834
## iter 6 value -3.074359
## iter 7 value -3.074843
## iter 8 value -3.076656
## iter 9 value -3.080467
## iter 10 value -3.081546
## iter 11 value -3.081603
## iter 12 value -3.081615
## iter 13 value -3.081642
## iter 14 value -3.081643
## iter 14 value -3.081643
## iter 14 value -3.081643
## final value -3.081643
## converged
## initial value -3.082345
## iter 2 value -3.082345
## iter 3 value -3.082346
## iter 4 value -3.082346
## iter 5 value -3.082346
## iter 5 value -3.082346
## iter 5 value -3.082346
## final value -3.082346
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## -0.5264 0.7146 0.0018
## s.e. 0.0871 0.0683 0.0022
##
## sigma^2 estimated as 0.002102: log likelihood = 904.89, aic = -1801.79
##
## $degrees_of_freedom
## [1] 541
##
## $ttable
## Estimate SE t.value p.value
## ar1 -0.5264 0.0871 -6.0422 0.0000
## ma1 0.7146 0.0683 10.4699 0.0000
## xmean 0.0018 0.0022 0.7981 0.4252
##
## $AIC
## [1] -5.153838
##
## $AICc
## [1] -5.150025
##
## $BIC
## [1] -6.130131
Chapter 3 - ARIMA Models
ARIMA - Integrated ARMA fitted to non-stationary time series:
ARIMA Diagnostics - typical concerns about overfitting:
Forecasting ARIMA - the model describes the dynamics, which can be applied into the future to produce forecasts:
Example code includes:
x <- arima.sim(model = list(order = c(1, 1, 0), ar = .9), n = 200)
# Plot x
plot(x)
# Plot the P/ACF pair of x
astsa::acf2(x)
## ACF PACF
## [1,] 0.99 0.99
## [2,] 0.99 -0.09
## [3,] 0.98 -0.07
## [4,] 0.97 -0.06
## [5,] 0.96 -0.04
## [6,] 0.95 -0.06
## [7,] 0.94 -0.05
## [8,] 0.93 -0.04
## [9,] 0.91 -0.04
## [10,] 0.90 -0.03
## [11,] 0.89 -0.04
## [12,] 0.87 -0.03
## [13,] 0.86 -0.01
## [14,] 0.84 -0.03
## [15,] 0.82 -0.02
## [16,] 0.81 -0.05
## [17,] 0.79 -0.05
## [18,] 0.77 -0.03
## [19,] 0.76 -0.03
## [20,] 0.74 -0.03
## [21,] 0.72 -0.03
## [22,] 0.70 -0.02
## [23,] 0.68 -0.02
## [24,] 0.66 -0.02
## [25,] 0.64 -0.01
# Plot the differenced data
plot(diff(x))
# Plot the P/ACF pair of the differenced data
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.84 0.84
## [2,] 0.70 -0.02
## [3,] 0.58 0.00
## [4,] 0.48 -0.04
## [5,] 0.44 0.16
## [6,] 0.39 -0.01
## [7,] 0.35 -0.01
## [8,] 0.32 0.04
## [9,] 0.28 -0.04
## [10,] 0.22 -0.09
## [11,] 0.17 0.02
## [12,] 0.16 0.09
## [13,] 0.21 0.18
## [14,] 0.26 0.08
## [15,] 0.32 0.10
## [16,] 0.35 -0.01
## [17,] 0.31 -0.11
## [18,] 0.27 -0.04
## [19,] 0.23 -0.03
## [20,] 0.21 0.05
## [21,] 0.18 -0.12
## [22,] 0.16 0.03
## [23,] 0.13 -0.07
## [24,] 0.08 -0.08
## [25,] 0.03 0.02
xData <- c( 2.071, 4.75, 6.674, 5.908, 3.886, 1.797, 0.649, 0.944, 1.755, 0.949, -0.321, -2.235, -4.472, -5.33, -3.556, 0.183, 6.393, 13.8, 20.431, 23.98, 24.522, 23.907, 23.27, 22.19, 20.059, 18.234, 17.08, 18.352, 21.234, 22.34, 21.248, 20.583, 19.799, 18.604, 19.393, 20.45, 21.861, 24.772, 29.022, 33.568, 38.256, 41.102, 42.96, 44.971, 47.002, 47.558, 47.397, 47.664, 47.592, 46.829, 46.66, 47.851, 51.184, 55.756, 60.053, 65.424, 71.336, 75.162, 77.131, 77.535, 76.534, 75.268, 74.917, 74.917, 74.447, 73.814, 71.874, 70.049, 68.571, 69.212, 72.331, 77.285, 82.489, 88.604, 94.093, 97.054, 99.208, 99.862, 100.939, 101.231, 101.496, 102.408, 103.906, 107.007, 111.464, 115.662, 119.608, 123.482, 125.956, 126.39, 126.386, 125.913, 125.488, 125.576, 126.291, 127.143, 127.52, 126.081, 124.965, 123.745, 122.581, 121.929, 123.325, 126.775, 132.555, 139.235, 144.934, 149.721, 154.382, 157.019, 157.206, 154.616, 148.832, 141.499, 135.467, 131.852, 132.204, 136.506, 142.587, 148.555, 150.681, 148.482, 142.889, 136.895, 131.35, 128.87, 127.53, 128.324, 131.564, 136.374, 142.986, 150.038, 155.446, 159.031, 159.776, 157.518, 155.821, 156.742, 159.896, 162.664, 164.717, 166.054, 164.365, 160.334, 153.985, 148.808, 146.378, 145.179, 145.683, 148.118, 152.318, 158.13, 164.868, 171.405, 177.053, 182.439, 186.528, 189.036, 191.453, 193.507, 196.097, 198.629, 200.216, 200.839, 201.791, 201.882, 201.844, 201.766, 204.88, 208.738, 212.117, 214.878, 218.935, 223.003, 227.042, 228.179, 227.576, 227.183, 227.895, 229.689, 232.106, 234.707, 234.405, 232.747, 232.052, 234.176, 237.706, 243.079, 247.933, 249.965, 251.077, 250.945, 250.302, 248.648, 248.404, 250.725, 255.209, 260.453, 264.559, 268.147, 269.122, 267.308, 262.819, 258.705, 255.487, 253.049, 251.807, 251.932, 253.196, 256.489, 259.875, 263.342, 266.208, 266.414, 265.439, 264.196, 264.413, 266.275, 270.239, 276.725, 283.784, 289.445, 292.879, 293.287, 292.272, 290.836, 288.097, 285.868, 283.051, 281.694, 281.11, 281.1, 282.375, 
284.273, 286.304, 290.172, 296.595, 303.989, 310.565, 315.547, 317.702, 317.364, 313.184, 306.788, 300.193, 295.649, 293.628, 296.013, 301.313, 306.754 )
x <- ts(data=xData, frequency=1, start=c(1, 1))
str(x)
## Time-Series [1:250] from 1 to 250: 2.07 4.75 6.67 5.91 3.89 ...
y <- diff(x)
# Plot sample P/ACF of differenced data and determine model
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.86 0.86
## [2,] 0.53 -0.75
## [3,] 0.15 -0.04
## [4,] -0.18 -0.05
## [5,] -0.41 -0.04
## [6,] -0.49 0.03
## [7,] -0.45 -0.09
## [8,] -0.32 0.01
## [9,] -0.17 -0.10
## [10,] -0.04 -0.08
## [11,] 0.05 0.02
## [12,] 0.08 -0.10
## [13,] 0.06 -0.09
## [14,] 0.00 -0.03
## [15,] -0.04 0.04
## [16,] -0.07 -0.07
## [17,] -0.06 0.03
## [18,] -0.01 0.07
## [19,] 0.06 0.01
## [20,] 0.14 0.06
## [21,] 0.20 -0.08
## [22,] 0.20 0.01
## [23,] 0.15 -0.02
## [24,] 0.07 0.06
## [25,] -0.02 0.04
## [26,] -0.07 0.03
# Estimate parameters and examine output
astsa::sarima(x, p=2, d=1, q=0)
## initial value 1.127641
## iter 2 value 0.983533
## iter 3 value 0.570293
## iter 4 value 0.314868
## iter 5 value 0.100372
## iter 6 value 0.063137
## iter 7 value 0.007514
## iter 8 value 0.005891
## iter 9 value 0.005789
## iter 10 value 0.005620
## iter 11 value 0.005527
## iter 12 value 0.005526
## iter 13 value 0.005526
## iter 13 value 0.005526
## iter 13 value 0.005526
## final value 0.005526
## converged
## initial value 0.008531
## iter 2 value 0.008509
## iter 3 value 0.008495
## iter 4 value 0.008495
## iter 5 value 0.008495
## iter 6 value 0.008495
## iter 6 value 0.008495
## iter 6 value 0.008495
## final value 0.008495
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 constant
## 1.5197 -0.7669 1.2335
## s.e. 0.0401 0.0401 0.2570
##
## sigma^2 estimated as 1.004: log likelihood = -355.43, aic = 718.86
##
## $degrees_of_freedom
## [1] 247
##
## $ttable
## Estimate SE t.value p.value
## ar1 1.5197 0.0401 37.9154 0
## ar2 -0.7669 0.0401 -19.1298 0
## constant 1.2335 0.2570 4.7992 0
##
## $AIC
## [1] 1.028458
##
## $AICc
## [1] 1.037112
##
## $BIC
## [1] 0.07071602
data(globtemp, package="astsa")
# Plot the sample P/ACF pair of the differenced data
astsa::acf2(diff(globtemp))
## ACF PACF
## [1,] -0.24 -0.24
## [2,] -0.19 -0.26
## [3,] -0.08 -0.23
## [4,] 0.20 0.06
## [5,] -0.15 -0.16
## [6,] -0.03 -0.09
## [7,] 0.03 -0.05
## [8,] 0.14 0.07
## [9,] -0.16 -0.09
## [10,] 0.11 0.11
## [11,] -0.05 -0.03
## [12,] 0.00 -0.02
## [13,] -0.13 -0.10
## [14,] 0.14 0.02
## [15,] -0.01 0.00
## [16,] -0.08 -0.09
## [17,] 0.00 0.00
## [18,] 0.19 0.11
## [19,] -0.07 0.04
## [20,] 0.02 0.13
## [21,] -0.02 0.09
## [22,] 0.08 0.08
# Fit an ARIMA(1,1,1) model to globtemp
astsa::sarima(globtemp, p=1, d=1, q=1)
## initial value -2.218917
## iter 2 value -2.253118
## iter 3 value -2.263750
## iter 4 value -2.272144
## iter 5 value -2.282786
## iter 6 value -2.296777
## iter 7 value -2.297062
## iter 8 value -2.297253
## iter 9 value -2.297389
## iter 10 value -2.297405
## iter 11 value -2.297413
## iter 12 value -2.297413
## iter 13 value -2.297414
## iter 13 value -2.297414
## iter 13 value -2.297414
## final value -2.297414
## converged
## initial value -2.305504
## iter 2 value -2.305800
## iter 3 value -2.305821
## iter 4 value -2.306655
## iter 5 value -2.306875
## iter 6 value -2.306950
## iter 7 value -2.306955
## iter 8 value -2.306955
## iter 8 value -2.306955
## final value -2.306955
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ma1 constant
## 0.3549 -0.7663 0.0072
## s.e. 0.1314 0.0874 0.0032
##
## sigma^2 estimated as 0.009885: log likelihood = 119.88, aic = -231.76
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3549 0.1314 2.7008 0.0078
## ma1 -0.7663 0.0874 -8.7701 0.0000
## constant 0.0072 0.0032 2.2738 0.0246
##
## $AIC
## [1] -3.572642
##
## $AICc
## [1] -3.555691
##
## $BIC
## [1] -4.508392
# Fit an ARIMA(0,1,2) model to globtemp. Which model is better?
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
xData <- c( -0.0751, 0.1473, 1.8112, 4.8931, 7.0292, 8.1352, 9.0227, 10.3904, 11.9989, 11.4527, 11.2707, 12.5312, 12.1963, 10.7977, 12.0651, 13.5885, 12.4802, 11.709, 10.9356, 12.3663, 14.3876, 14.2129, 13.5661, 12.9155, 13.4154, 14.9105, 16.2552, 16.7393, 17.1447, 18.0555, 19.7376, 22.5407, 24.7367, 24.8413, 24.2488, 24.2967, 24.2308, 23.8902, 23.7027, 23.119, 22.7335, 22.9657, 23.8808, 24.4345, 24.2466, 23.4257, 20.8514, 19.4998, 19.9398, 20.2972, 20.7262, 20.1964, 17.5082, 15.9907, 15.4264, 14.1124, 14.4446, 16.3402, 17.577, 19.4557, 21.6471, 22.1894, 21.0641, 20.0541, 21.0169, 22.3758, 21.9696, 20.0109, 19.2389, 19.2861, 20.4638, 21.5998, 18.9907, 15.9218, 16.751, 17.3235, 15.8171, 16.9022, 17.2296, 16.2838, 17.8028, 19.7293, 20.4888, 21.4197, 21.1516, 21.1138, 23.0237, 24.211, 23.1522, 22.3539, 23.3107, 23.1071, 21.6763, 21.7444, 23.002, 24.7646, 26.0639, 25.9787, 27.8355, 30.5886, 30.1021, 29.4103, 29.8847, 29.5996, 29.5772, 30.4156, 30.2665, 28.7099, 27.6781, 25.9568, 24.9156, 24.8254, 25.6952, 27.641, 28.8981, 29.2489, 30.9297, 32.5278, 31.5972, 32.3645, 33.2106, 34.1595, 34.4231, 33.8642, 34.7263, 35.2714, 36.6619, 38.5322, 38.7635, 39.1658, 40.7182, 40.891, 39.7363, 40.1594, 40.6549, 40.3654, 40.5468, 40.7007, 40.3408, 39.3942, 37.2571, 36.9096, 37.0338, 35.8572, 35.4378, 36.6571, 38.4328, 40.4212, 42.0617, 42.1701, 42.9875, 45.4235, 45.7948, 44.3909, 42.8091, 39.8039, 37.1785, 36.8238, 36.8816, 37.6287, 39.3721, 39.7785, 39.3112, 36.6673, 33.274, 31.3097, 30.9826, 30.462, 30.6871, 29.6729, 28.5721, 30.0226, 31.0649, 32.9386, 34.8814, 34.8945, 35.0234, 34.6894, 33.0402, 34.2274, 37.5808, 39.2334, 37.9677, 36.6451, 36.7756, 34.4778, 31.6004, 29.1428, 28.61, 29.9308, 28.5681, 27.3121, 28.0795, 29.2628, 30.9914, 32.9232, 34.3216, 35.4834, 37.6638, 39.102, 39.2936, 40.9448, 42.3607, 43.5172, 44.4513, 43.9077, 43.3648, 44.2566, 44.0296, 43.3438, 43.433, 46.2347, 47.8019, 46.502, 46.5795, 49.1136, 50.928, 51.5114, 50.0802, 48.6748, 50.2435, 51.8771, 52.6298, 
52.8352, 52.9461, 50.4009, 48.5522, 50.3446, 53.2334, 54.3444, 55.4121, 55.9148, 53.7499, 53.9132, 54.7285, 54.4254, 53.5442, 54.1458, 56.728, 58.4062, 58.9589, 58.3515, 58.9129, 58.3679, 56.145, 54.1373, 54.0196, 54.2961, 52.784, 51.715 )
x <- ts(data=xData, frequency=1, start=c(1, 1))
str(x)
## Time-Series [1:250] from 1 to 250: -0.0751 0.1473 1.8112 4.8931 7.0292 ...
# Plot sample P/ACF pair of the differenced data
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.49 0.49
## [2,] -0.02 -0.34
## [3,] 0.02 0.29
## [4,] 0.03 -0.23
## [5,] -0.01 0.18
## [6,] -0.02 -0.17
## [7,] -0.09 0.01
## [8,] -0.07 -0.01
## [9,] -0.02 -0.03
## [10,] -0.10 -0.13
## [11,] -0.10 0.09
## [12,] 0.00 -0.08
## [13,] -0.03 0.00
## [14,] -0.10 -0.11
## [15,] -0.07 0.05
## [16,] -0.03 -0.06
## [17,] 0.01 0.07
## [18,] 0.02 -0.07
## [19,] -0.02 0.02
## [20,] 0.00 0.02
## [21,] 0.10 0.08
## [22,] 0.15 0.09
## [23,] 0.12 -0.02
## [24,] 0.01 -0.06
## [25,] -0.04 0.01
## [26,] 0.02 0.03
# Fit the first model, compare parameters, check diagnostics
astsa::sarima(x, p=0, d=1, q=1)
## initial value 0.282663
## iter 2 value 0.086381
## iter 3 value 0.013882
## iter 4 value -0.019189
## iter 5 value -0.020178
## iter 6 value -0.020411
## iter 7 value -0.020429
## iter 8 value -0.020430
## iter 9 value -0.020431
## iter 10 value -0.020431
## iter 11 value -0.020431
## iter 12 value -0.020431
## iter 12 value -0.020431
## iter 12 value -0.020431
## final value -0.020431
## converged
## initial value -0.016992
## iter 2 value -0.017046
## iter 3 value -0.017049
## iter 4 value -0.017050
## iter 5 value -0.017050
## iter 6 value -0.017050
## iter 7 value -0.017050
## iter 8 value -0.017050
## iter 8 value -0.017050
## iter 8 value -0.017050
## final value -0.017050
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 constant
## 0.9065 0.2063
## s.e. 0.0323 0.1181
##
## sigma^2 estimated as 0.9598: log likelihood = -349.07, aic = 704.14
##
## $degrees_of_freedom
## [1] 248
##
## $ttable
## Estimate SE t.value p.value
## ma1 0.9065 0.0323 28.0497 0.0000
## constant 0.2063 0.1181 1.7459 0.0821
##
## $AIC
## [1] 0.9749726
##
## $AICc
## [1] 0.9833628
##
## $BIC
## [1] 0.003144257
# Fit the second model and compare fit
astsa::sarima(x, p=0, d=1, q=2)
## initial value 0.282663
## iter 2 value 0.082436
## iter 3 value 0.052466
## iter 4 value -0.014265
## iter 5 value -0.018249
## iter 6 value -0.019318
## iter 7 value -0.020294
## iter 8 value -0.020432
## iter 9 value -0.020432
## iter 10 value -0.020433
## iter 11 value -0.020433
## iter 12 value -0.020433
## iter 13 value -0.020433
## iter 13 value -0.020433
## iter 13 value -0.020433
## final value -0.020433
## converged
## initial value -0.016998
## iter 2 value -0.017045
## iter 3 value -0.017056
## iter 4 value -0.017057
## iter 5 value -0.017058
## iter 6 value -0.017058
## iter 7 value -0.017058
## iter 8 value -0.017058
## iter 8 value -0.017058
## final value -0.017058
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## 0.9099 0.0041 0.2063
## s.e. 0.0651 0.0684 0.1186
##
## sigma^2 estimated as 0.9598: log likelihood = -349.07, aic = 706.14
##
## $degrees_of_freedom
## [1] 247
##
## $ttable
## Estimate SE t.value p.value
## ma1 0.9099 0.0651 13.9821 0.0000
## ma2 0.0041 0.0684 0.0602 0.9521
## constant 0.2063 0.1186 1.7391 0.0833
##
## $AIC
## [1] 0.9829715
##
## $AICc
## [1] 0.9916246
##
## $BIC
## [1] 0.02522905
# Fit ARIMA(0,1,2) to globtemp and check diagnostics
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
# Fit ARIMA(1,1,1) to globtemp and check diagnostics
astsa::sarima(globtemp, p=1, d=1, q=1)
## initial value -2.218917
## iter 2 value -2.253118
## iter 3 value -2.263750
## iter 4 value -2.272144
## iter 5 value -2.282786
## iter 6 value -2.296777
## iter 7 value -2.297062
## iter 8 value -2.297253
## iter 9 value -2.297389
## iter 10 value -2.297405
## iter 11 value -2.297413
## iter 12 value -2.297413
## iter 13 value -2.297414
## iter 13 value -2.297414
## iter 13 value -2.297414
## final value -2.297414
## converged
## initial value -2.305504
## iter 2 value -2.305800
## iter 3 value -2.305821
## iter 4 value -2.306655
## iter 5 value -2.306875
## iter 6 value -2.306950
## iter 7 value -2.306955
## iter 8 value -2.306955
## iter 8 value -2.306955
## final value -2.306955
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ma1 constant
## 0.3549 -0.7663 0.0072
## s.e. 0.1314 0.0874 0.0032
##
## sigma^2 estimated as 0.009885: log likelihood = 119.88, aic = -231.76
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3549 0.1314 2.7008 0.0078
## ma1 -0.7663 0.0874 -8.7701 0.0000
## constant 0.0072 0.0032 2.2738 0.0246
##
## $AIC
## [1] -3.572642
##
## $AICc
## [1] -3.555691
##
## $BIC
## [1] -4.508392
yData <- c( 1.475, 3.061, 6.53, 9.844, 15.735, 20.798, 24.635, 27.322, 28.793, 30.4, 31.672, 32.209, 33.255, 35.53, 35.87, 35.65, 35.766, 34.509, 32.438, 30.804, 30.913, 29.845, 28.667, 27.555, 26.962, 26.649, 28.018, 30.804, 34.625, 38.363, 41.745, 46.059, 51.431, 56.778, 61.529, 65.51, 69.054, 70.332, 72.318, 73.341, 74.756, 77.632, 78.618, 78.419, 78.412, 80.362, 82.771, 84.24, 86.619, 89.241, 93.318, 95.566, 98.509, 102.085, 105.017, 107.242, 107.946, 107.948, 107.554, 106.475, 105.517, 104.357, 104.296, 103.946, 102.896, 102.218, 102.796, 102.726, 101.759, 101.336, 100.97, 101.816, 101.736, 100.882, 100.974, 101.784, 101.409, 102.486, 102.971, 103.105, 103.886, 104.559, 104.349, 104.152, 105.461, 106.456, 106.611, 106.827, 108.587, 110.033, 110.993, 113.209, 113.397, 113.575, 113.945, 113.785, 113.473, 112.939, 112.222, 110.297, 108.388, 108.208, 107.125, 105.905, 103.513, 102.305, 102.325, 103.09, 104.299, 104.13, 104.388, 104.854, 106.697, 109.026, 110.97, 112.576, 113.896, 115.206, 116.374, 117.487 )
y <- ts(data=yData, frequency=1, start=c(1, 1))
str(y)
## Time-Series [1:120] from 1 to 120: 1.48 3.06 6.53 9.84 15.73 ...
x <- window(y, end=c(100, 1))
str(x)
## Time-Series [1:100] from 1 to 100: 1.48 3.06 6.53 9.84 15.73 ...
# Plot P/ACF pair of differenced data
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.83 0.83
## [2,] 0.69 -0.01
## [3,] 0.59 0.05
## [4,] 0.46 -0.13
## [5,] 0.32 -0.14
## [6,] 0.19 -0.08
## [7,] 0.09 0.02
## [8,] -0.02 -0.14
## [9,] -0.10 0.01
## [10,] -0.20 -0.17
## [11,] -0.25 0.08
## [12,] -0.23 0.11
## [13,] -0.22 0.00
## [14,] -0.21 0.00
## [15,] -0.21 -0.12
## [16,] -0.15 0.12
## [17,] -0.10 0.01
## [18,] -0.05 0.03
## [19,] -0.01 -0.02
## [20,] 0.04 0.00
# Fit model - check t-table and diagnostics
astsa::sarima(x, p=1, d=1, q=0)
## initial value 0.591964
## iter 2 value -0.038076
## iter 3 value -0.039015
## iter 4 value -0.039144
## iter 5 value -0.039245
## iter 6 value -0.039461
## iter 7 value -0.039501
## iter 8 value -0.039514
## iter 9 value -0.039528
## iter 10 value -0.039550
## iter 11 value -0.039561
## iter 12 value -0.039564
## iter 13 value -0.039564
## iter 14 value -0.039564
## iter 15 value -0.039564
## iter 16 value -0.039564
## iter 17 value -0.039564
## iter 17 value -0.039564
## iter 17 value -0.039564
## final value -0.039564
## converged
## initial value -0.037148
## iter 2 value -0.037210
## iter 3 value -0.037327
## iter 4 value -0.037336
## iter 5 value -0.037368
## iter 6 value -0.037369
## iter 7 value -0.037369
## iter 8 value -0.037369
## iter 9 value -0.037369
## iter 10 value -0.037369
## iter 10 value -0.037369
## iter 10 value -0.037369
## final value -0.037369
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 constant
## 0.8504 0.9685
## s.e. 0.0525 0.6111
##
## sigma^2 estimated as 0.916: log likelihood = -136.78, aic = 279.55
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.8504 0.0525 16.1970 0.0000
## constant 0.9685 0.6111 1.5849 0.1162
##
## $AIC
## [1] 0.9522847
##
## $AICc
## [1] 0.9747847
##
## $BIC
## [1] 0.004388103
# Forecast the data 20 time periods ahead
astsa::sarima.for(x, n.ahead = 20, p = 1, d = 1, q = 0)
## $pred
## Time Series:
## Start = 101
## End = 120
## Frequency = 1
## [1] 108.8047 107.6805 106.8692 106.3241 106.0054 105.8792 105.9167
## [8] 106.0934 106.3886 106.7844 107.2659 107.8202 108.4365 109.1054
## [15] 109.8192 110.5710 111.3552 112.1670 113.0022 113.8574
##
## $se
## Time Series:
## Start = 101
## End = 120
## Frequency = 1
## [1] 0.9570902 2.0131099 3.1812378 4.4084826 5.6617802 6.9197771
## [7] 8.1684230 9.3984599 10.6038817 11.7809361 12.9274522 14.0423743
## [13] 15.1254331 16.1769097 17.1974643 18.1880125 19.1496340 20.0835066
## [19] 20.9908583 21.8729318
lines(y)
# Fit an ARIMA(0,1,2) to globtemp and check the fit
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
# Forecast data 35 years into the future
astsa::sarima.for(globtemp, n.ahead=35, p=0, d=1, q=2)
## $pred
## Time Series:
## Start = 2016
## End = 2050
## Frequency = 1
## [1] 0.7995567 0.7745381 0.7816919 0.7888457 0.7959996 0.8031534 0.8103072
## [8] 0.8174611 0.8246149 0.8317688 0.8389226 0.8460764 0.8532303 0.8603841
## [15] 0.8675379 0.8746918 0.8818456 0.8889995 0.8961533 0.9033071 0.9104610
## [22] 0.9176148 0.9247687 0.9319225 0.9390763 0.9462302 0.9533840 0.9605378
## [29] 0.9676917 0.9748455 0.9819994 0.9891532 0.9963070 1.0034609 1.0106147
##
## $se
## Time Series:
## Start = 2016
## End = 2050
## Frequency = 1
## [1] 0.09909556 0.11564576 0.12175580 0.12757353 0.13313729 0.13847769
## [7] 0.14361964 0.14858376 0.15338730 0.15804492 0.16256915 0.16697084
## [13] 0.17125943 0.17544322 0.17952954 0.18352490 0.18743511 0.19126540
## [19] 0.19502047 0.19870459 0.20232164 0.20587515 0.20936836 0.21280424
## [25] 0.21618551 0.21951471 0.22279416 0.22602604 0.22921235 0.23235497
## [31] 0.23545565 0.23851603 0.24153763 0.24452190 0.24747019
Chapter 4 - Seasonal ARIMA
Pure Seasonal Models - data are often collected with known seasonal patterns (quarterly, monthly, etc.):
Mixed Seasonal Models - purely seasonal models are rare, so the mixed model is more common:
Forecasting Seasonal ARIMA - relatively easy using astsa::sarima.for():
Example code includes:
xData <- c( -3.063, -1.997, -3.925, 5.37, 7.47, 0.502, 2.477, -10.093, -3.462, 1.835, 3.802, 1.853, -1.945, -1.881, -4.783, 4.361, 7.159, 2.699, 0.237, -9.933, -3.406, 0.718, 2.713, 2.309, -1.308, -0.573, -5.37, 3.053, 7.749, 3.926, -0.354, -10.326, -1.302, 1.796, 1.537, 4.596, -0.938, -0.753, -5.059, 3.346, 7.319, 2.802, 0.236, -9.541, -1.466, 3.829, 1.562, 3.934, -0.795, -0.32, -4.607, 2.947, 6.479, 0.403, 0.413, -8.069, -2.512, 4.105, 0.449, 1.274, -0.561, -0.346, -2.933, 2.525, 5.876, -1.374, -0.833, -8.193, -1.465, 5.502, 0.145, 1.336, -0.097, 0.893, -2.447, 2.869, 4.522, -1.133, -0.961, -8.43, -1.324, 6.856, 0.561, 1.842, -0.454, 2.786, -4.908, 2.909, 3.65, -0.681, -1.064, -6.475, 0.313, 6.849, 2.605, 3.129, -0.627, 2.904, -6.023, 1.976, 3.745, -1.207, -0.231, -5.569, 0.116, 4.874, 3.749, 4.216, -0.801, 2.669, -3.866, 3.526, 3.61, -0.298, -0.366, -5.148, -1.465, 2.259, 3.214, 4.789, -0.784, 2.858, -3.764, 3.885, 2.725, 1.297, -1.534, -4.081, -2.081, -0.05, 1.18, 4.582, -2.742, 1.99, -2.828, 4.169, 0.753, 2.19, -1.838, -2.821, -4.067, -1.38, 0.983, 4.561, -3.011, 0.569, -3.255, 2.012, -0.396, 1.63, -1.766, -2.187, -2.507, -1.296, 1.745, 4.975, -3.102, 1.36, -2.611, -0.109, 1.388, 1.727, -2.49, -3.813, -1.957, -0.572, 2.379, 5.92, -5.054, 1.698, -2.621, -1.539, 1.802, 1.932, -1.406, -5.839, -3.011, -0.79, 2.08, 4.144, -6.072, 2.374, -2.659, -2.098, 0.722, 2.443, -1.122, -5.98, -4.85, -0.712, 1.868, 2.127, -6.854, 1.91, -3.205, -1.139, 0.581, 1.527, -2.051, -6.724, -4.612, -1.236, 0.59, 0.828, -7.434, 0.602, -4.288, -1.825, -0.242, 0.107, -2.541, -7.618, -4.066, 0.323, 0.167, 0.145, -6.404, 0.585, -3.075, -3.812, -2.484, 0.783, -2.512, -7.77, -4.389, 2.426, 0.607, 0.47, -5.934, 1.551, -1.288, -3.312, -3.321, 2.478, -1.351, -10.693, -5.375, 3.161, -0.474, 2.11, -6.453, 0.999, -0.473, -2.442, -3.74, 3.271, -2.57, -10.644, -3.972, 2.408, 0.068, 3.375 )
x <- ts(data=xData, frequency=12, start=c(1, 1))
str(x)
## Time-Series [1:252] from 1 to 21.9: -3.06 -2 -3.92 5.37 7.47 ...
# Plot sample P/ACF to lag 60 and compare to the true values
astsa::acf2(x, max.lag = 60)
## ACF PACF
## [1,] 0.13 0.13
## [2,] -0.16 -0.18
## [3,] -0.35 -0.32
## [4,] -0.13 -0.09
## [5,] 0.27 0.22
## [6,] 0.26 0.11
## [7,] 0.29 0.31
## [8,] -0.12 0.03
## [9,] -0.34 -0.18
## [10,] -0.14 -0.02
## [11,] 0.11 0.00
## [12,] 0.89 0.84
## [13,] 0.12 -0.15
## [14,] -0.15 0.12
## [15,] -0.33 0.06
## [16,] -0.12 0.06
## [17,] 0.25 0.01
## [18,] 0.24 -0.06
## [19,] 0.28 -0.10
## [20,] -0.12 -0.06
## [21,] -0.32 0.00
## [22,] -0.11 0.07
## [23,] 0.09 -0.05
## [24,] 0.76 -0.14
## [25,] 0.09 0.00
## [26,] -0.14 -0.02
## [27,] -0.32 -0.03
## [28,] -0.12 -0.05
## [29,] 0.23 -0.03
## [30,] 0.22 0.00
## [31,] 0.25 0.00
## [32,] -0.13 0.02
## [33,] -0.31 -0.02
## [34,] -0.07 0.04
## [35,] 0.08 -0.02
## [36,] 0.65 0.07
## [37,] 0.06 -0.03
## [38,] -0.14 -0.05
## [39,] -0.30 -0.01
## [40,] -0.12 0.01
## [41,] 0.20 -0.13
## [42,] 0.19 -0.03
## [43,] 0.22 -0.06
## [44,] -0.13 -0.03
## [45,] -0.30 -0.02
## [46,] -0.03 0.02
## [47,] 0.06 -0.02
## [48,] 0.56 0.02
## [49,] 0.04 0.08
## [50,] -0.13 0.04
## [51,] -0.29 -0.02
## [52,] -0.11 0.01
## [53,] 0.17 0.02
## [54,] 0.16 -0.03
## [55,] 0.19 0.01
## [56,] -0.14 -0.07
## [57,] -0.28 0.00
## [58,] -0.02 -0.03
## [59,] 0.05 0.01
## [60,] 0.49 0.01
# Fit the seasonal model to x
astsa::sarima(x, p = 0, d = 0, q = 0, P = 1, D = 0, Q = 1, S = 12)
## initial value 1.274226
## iter 2 value 0.228901
## iter 3 value 0.028957
## iter 4 value 0.010808
## iter 5 value -0.002171
## iter 6 value -0.017847
## iter 7 value -0.018632
## iter 8 value -0.018759
## iter 9 value -0.018822
## iter 10 value -0.019245
## iter 11 value -0.019842
## iter 12 value -0.020194
## iter 13 value -0.020236
## iter 14 value -0.020241
## iter 15 value -0.020241
## iter 15 value -0.020241
## final value -0.020241
## converged
## initial value 0.064889
## iter 2 value 0.063302
## iter 3 value 0.061944
## iter 4 value 0.061263
## iter 5 value 0.061164
## iter 6 value 0.061036
## iter 7 value 0.060772
## iter 8 value 0.060428
## iter 9 value 0.060343
## iter 10 value 0.060260
## iter 11 value 0.060192
## iter 12 value 0.060181
## iter 13 value 0.060178
## iter 14 value 0.060174
## iter 15 value 0.060165
## iter 16 value 0.060160
## iter 17 value 0.060159
## iter 18 value 0.060151
## iter 19 value 0.060150
## iter 20 value 0.060149
## iter 21 value 0.060149
## iter 22 value 0.060148
## iter 23 value 0.060148
## iter 24 value 0.060148
## iter 25 value 0.060148
## iter 25 value 0.060148
## iter 25 value 0.060148
## final value 0.060148
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## sar1 sma1 xmean
## 0.9310 0.4825 -0.5765
## s.e. 0.0204 0.0633 0.8797
##
## sigma^2 estimated as 0.9766: log likelihood = -372.73, aic = 753.46
##
## $degrees_of_freedom
## [1] 249
##
## $ttable
## Estimate SE t.value p.value
## sar1 0.9310 0.0204 45.6128 0.0000
## sma1 0.4825 0.0633 7.6187 0.0000
## xmean -0.5765 0.8797 -0.6553 0.5129
##
## $AIC
## [1] 1.000131
##
## $AICc
## [1] 1.00871
##
## $BIC
## [1] 0.04214768
xData <- c( -1.243, -0.68, 1.356, 0.843, -0.409, 1.062, -1.08, 3.002, 0.812, -0.388, -1.788, 2.321, -3.264, 0.866, -0.004, 0.289, 0.855, 1.445, -1.085, 2.426, -2.201, -1.014, 0.127, 1.326, -2.958, 2.635, -1.209, 0.288, 0.025, 2.225, -0.792, 2.58, -2.44, -1.961, 1.732, -0.62, -1.063, 1.148, -0.553, 1.192, -1.642, 0.836, 1.022, 0.844, 0.407, -1.239, -0.093, -0.918, -0.543, 0.017, 0.218, 1.895, -1.628, 1.092, 1.425, -0.962, -1.407, 0.58, 0.128, -0.509, -0.38, 0.886, -1.135, 2.319, -1.199, 2.7, 0.34, -1.393, -1.553, 1.149, 1.95, -0.563, -1.746, 2.44, -1.449, 0.306, 0.495, 2.17, 1.035, 0.186, 0.044, 0.972, -1.724, 1.314, -1.912, 1.81, 1.111, -1.517, 2.95, -1.682, 2.422, -1.526, 0.372, -0.503, -0.16, -1.42, -0.826, 1.201, 1.764, -1.759, 3.392, -0.873, 1.489, -2.768, 0.442, 0.171, -1.117, -0.757, 0.756, 0.931, -0.832, 1.028, 1.176, -0.27, 0.818, -2.096, -0.234, 0.31, -1.018, 2.883, -1.119, 0.201, -0.495, 1.506, -0.696, 0.021, 0.461, -2.817, 0.665, -0.77, 2.283, 0.635, -2.876, -0.201, 1.109, 0.666, 0.096, -0.776, -2.022, 2.101, -0.861, -1.659, 3.324, -0.428, 0.002, -0.063, 0.081, -0.034, -1.022, 0.247, -2.832, 4.967, -2.348, -1.963, 2.966, 0.317, 0.678, -1.146, -0.279, 1.632, -3.308, 1.183, 0.875, 1.941, -1.427, -1.036, 1.195, 1.425, 1.126, -3.354, 1.025, 0.976, -1.01, -1.437, 2.349, -0.452, 0.269, -0.245, -1.107, 2.442, -0.544, -0.114, -0.121, 1.017, -1.107, -0.679, 0.356, -0.535, 0.584, 1.075, -1.73, 1.321, -1.503, 0.797, -0.713, 1.599, -1.551, 1.462, -1.566, -2.094, 1.159, 1.52, 0.528, -0.48, 0.02, -0.357, 1.088, -0.936, 2.707, -0.053, -1.876, -1.162, 2.719, -0.818, -0.351, 0.459, 0.65, -0.735, 2.805, -1.153, 2.171, -0.007, -0.54, -1.186, 1.694, 0.491, -3.27, 1.605, -0.256, 0.235, 2.334, 1.164, -2.024, -0.174, 1.588, -3.079, -1.286, 2.68, -2.625, 0.28, -0.91, 0.789, 1.677, 1.291, -2.935, 0.587, 0.783, -0.749, -0.455, 1.181, -0.221, -1.713 )
x <- ts(data=xData, frequency=12, start=c(1, 1))
str(x)
## Time-Series [1:252] from 1 to 21.9: -1.243 -0.68 1.356 0.843 -0.409 ...
# Plot sample P/ACF pair to lag 60 and compare to actual
astsa::acf2(x, max.lag=60)
## ACF PACF
## [1,] -0.41 -0.41
## [2,] -0.03 -0.24
## [3,] 0.00 -0.14
## [4,] 0.06 -0.01
## [5,] -0.10 -0.10
## [6,] 0.00 -0.11
## [7,] -0.06 -0.17
## [8,] 0.04 -0.11
## [9,] 0.04 -0.02
## [10,] -0.06 -0.08
## [11,] -0.13 -0.28
## [12,] 0.46 0.33
## [13,] -0.19 0.20
## [14,] -0.03 0.12
## [15,] 0.01 0.09
## [16,] 0.01 0.02
## [17,] 0.00 0.13
## [18,] -0.14 -0.08
## [19,] 0.02 -0.08
## [20,] 0.03 -0.06
## [21,] 0.04 -0.04
## [22,] -0.07 -0.04
## [23,] 0.13 0.21
## [24,] -0.06 -0.21
## [25,] 0.06 -0.06
## [26,] -0.03 -0.01
## [27,] -0.05 -0.10
## [28,] 0.05 0.06
## [29,] 0.00 -0.03
## [30,] -0.11 0.06
## [31,] -0.03 -0.05
## [32,] 0.07 -0.03
## [33,] -0.06 -0.12
## [34,] 0.05 -0.02
## [35,] 0.06 -0.13
## [36,] -0.04 0.08
## [37,] 0.05 0.01
## [38,] -0.02 -0.03
## [39,] -0.07 0.01
## [40,] 0.07 -0.05
## [41,] -0.09 -0.09
## [42,] 0.10 0.11
## [43,] -0.11 0.05
## [44,] 0.06 0.03
## [45,] -0.08 0.03
## [46,] 0.12 0.04
## [47,] -0.03 0.16
## [48,] -0.04 -0.12
## [49,] 0.05 -0.06
## [50,] -0.04 -0.05
## [51,] 0.04 0.03
## [52,] -0.07 -0.03
## [53,] -0.07 -0.09
## [54,] 0.17 -0.01
## [55,] -0.09 0.00
## [56,] -0.03 -0.05
## [57,] 0.04 -0.03
## [58,] 0.05 -0.03
## [59,] 0.00 -0.03
## [60,] -0.07 0.07
# Fit the seasonal model to x: SARIMA(0,0,1)x(0,0,1)_12, i.e. a
# non-seasonal MA(1) plus a seasonal MA(1) at period 12
astsa::sarima(x, p=0, d=0, q=1, P=0, D=0, Q=1, S=12)
## initial value 0.403514
## iter 2 value 0.107253
## iter 3 value 0.063347
## iter 4 value 0.050288
## iter 5 value 0.044945
## iter 6 value 0.041690
## iter 7 value 0.041311
## iter 8 value 0.041284
## iter 9 value 0.041280
## iter 10 value 0.041271
## iter 11 value 0.041271
## iter 12 value 0.041271
## iter 12 value 0.041271
## iter 12 value 0.041271
## final value 0.041271
## converged
## initial value 0.030505
## iter 2 value 0.027716
## iter 3 value 0.026597
## iter 4 value 0.026568
## iter 5 value 0.026568
## iter 6 value 0.026568
## iter 7 value 0.026567
## iter 7 value 0.026567
## iter 7 value 0.026567
## final value 0.026567
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 sma1 xmean
## -0.6142 0.7887 0.0784
## s.e. 0.0564 0.0475 0.0430
##
## sigma^2 estimated as 1.005: log likelihood = -364.27, aic = 736.54
##
## $degrees_of_freedom
## [1] 249
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.6142 0.0564 -10.8811 0.0000
## sma1 0.7887 0.0475 16.6073 0.0000
## xmean 0.0784 0.0430 1.8250 0.0692
##
## $AIC
## [1] 1.028746
##
## $AICc
## [1] 1.037325
##
## $BIC
## [1] 0.07076309
# Load the monthly unemployment series from the astsa package
data(unemp, package="astsa")
str(unemp)
## Time-Series [1:372] from 1948 to 1979: 235 281 265 241 201 ...
# Plot unemp
plot(unemp)
# Difference your data and plot it
d_unemp <- diff(unemp)
plot(d_unemp)
# Seasonally difference d_unemp and plot it
dd_unemp <- diff(d_unemp, lag = 12)
plot(dd_unemp)
# Plot P/ACF pair of fully differenced data to lag 60
# (same as dd_unemp above, recomputed in one expression)
dd_unemp <- diff(diff(unemp), lag = 12)
astsa::acf2(dd_unemp, max.lag=60)
## ACF PACF
## [1,] 0.21 0.21
## [2,] 0.33 0.29
## [3,] 0.15 0.05
## [4,] 0.17 0.05
## [5,] 0.10 0.01
## [6,] 0.06 -0.02
## [7,] -0.06 -0.12
## [8,] -0.02 -0.03
## [9,] -0.09 -0.05
## [10,] -0.17 -0.15
## [11,] -0.08 0.02
## [12,] -0.48 -0.43
## [13,] -0.18 -0.02
## [14,] -0.16 0.15
## [15,] -0.11 0.03
## [16,] -0.15 -0.04
## [17,] -0.09 -0.01
## [18,] -0.09 0.00
## [19,] 0.03 0.01
## [20,] -0.01 0.01
## [21,] 0.02 -0.01
## [22,] -0.02 -0.16
## [23,] 0.01 0.01
## [24,] -0.02 -0.27
## [25,] 0.09 0.05
## [26,] -0.05 -0.01
## [27,] -0.01 -0.05
## [28,] 0.03 0.05
## [29,] 0.08 0.09
## [30,] 0.01 -0.04
## [31,] 0.03 0.02
## [32,] -0.05 -0.07
## [33,] 0.01 -0.01
## [34,] 0.02 -0.08
## [35,] -0.06 -0.08
## [36,] -0.02 -0.23
## [37,] -0.12 -0.08
## [38,] 0.01 0.06
## [39,] -0.03 -0.07
## [40,] -0.03 -0.01
## [41,] -0.10 0.03
## [42,] -0.02 -0.03
## [43,] -0.13 -0.11
## [44,] 0.00 -0.04
## [45,] -0.06 0.01
## [46,] 0.01 0.00
## [47,] 0.02 -0.03
## [48,] 0.11 -0.04
## [49,] 0.13 0.02
## [50,] 0.10 0.03
## [51,] 0.07 -0.05
## [52,] 0.10 0.02
## [53,] 0.12 0.02
## [54,] 0.06 -0.08
## [55,] 0.14 0.00
## [56,] 0.05 -0.03
## [57,] 0.04 -0.07
## [58,] 0.04 0.05
## [59,] 0.07 0.04
## [60,] -0.03 -0.04
# Fit an appropriate model: SARIMA(2,1,0)x(0,1,1)_12
# (one regular and one seasonal difference, AR(2), seasonal MA(1))
astsa::sarima(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12)
## initial value 3.340809
## iter 2 value 3.105512
## iter 3 value 3.086631
## iter 4 value 3.079778
## iter 5 value 3.069447
## iter 6 value 3.067659
## iter 7 value 3.067426
## iter 8 value 3.067418
## iter 8 value 3.067418
## final value 3.067418
## converged
## initial value 3.065481
## iter 2 value 3.065478
## iter 3 value 3.065477
## iter 3 value 3.065477
## iter 3 value 3.065477
## final value 3.065477
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 sma1
## 0.1351 0.2464 -0.6953
## s.e. 0.0513 0.0515 0.0381
##
## sigma^2 estimated as 449.6: log likelihood = -1609.91, aic = 3227.81
##
## $degrees_of_freedom
## [1] 369
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.1351 0.0513 2.6326 0.0088
## ar2 0.2464 0.0515 4.7795 0.0000
## sma1 -0.6953 0.0381 -18.2362 0.0000
##
## $AIC
## [1] 7.12457
##
## $AICc
## [1] 7.130239
##
## $BIC
## [1] 6.156174
# Load the monthly chicken series (2002-2016) from the astsa package
data(chicken, package="astsa")
str(chicken)
## Time-Series [1:180] from 2002 to 2016: 65.6 66.5 65.7 64.3 63.2 ...
# Plot differenced chicken
plot(diff(chicken))
# Plot P/ACF pair of differenced data to lag 60
astsa::acf2(diff(chicken), max.lag=60)
## ACF PACF
## [1,] 0.72 0.72
## [2,] 0.39 -0.29
## [3,] 0.09 -0.14
## [4,] -0.07 0.03
## [5,] -0.16 -0.10
## [6,] -0.20 -0.06
## [7,] -0.27 -0.19
## [8,] -0.23 0.12
## [9,] -0.11 0.10
## [10,] 0.09 0.16
## [11,] 0.26 0.09
## [12,] 0.33 0.00
## [13,] 0.20 -0.22
## [14,] 0.07 0.03
## [15,] -0.03 0.03
## [16,] -0.10 -0.11
## [17,] -0.19 -0.09
## [18,] -0.25 0.01
## [19,] -0.29 -0.03
## [20,] -0.20 0.07
## [21,] -0.08 -0.04
## [22,] 0.08 0.06
## [23,] 0.16 -0.05
## [24,] 0.18 0.02
## [25,] 0.08 -0.14
## [26,] -0.06 -0.19
## [27,] -0.21 -0.13
## [28,] -0.31 -0.06
## [29,] -0.40 -0.08
## [30,] -0.40 -0.05
## [31,] -0.33 0.01
## [32,] -0.18 0.03
## [33,] 0.02 0.10
## [34,] 0.20 0.02
## [35,] 0.30 -0.01
## [36,] 0.35 0.09
## [37,] 0.26 -0.12
## [38,] 0.13 0.01
## [39,] -0.02 -0.01
## [40,] -0.14 -0.05
## [41,] -0.23 0.02
## [42,] -0.21 0.12
## [43,] -0.18 -0.05
## [44,] -0.11 -0.13
## [45,] -0.03 -0.07
## [46,] 0.08 0.01
## [47,] 0.21 0.14
## [48,] 0.33 0.05
## [49,] 0.26 -0.20
## [50,] 0.12 -0.01
## [51,] -0.01 0.07
## [52,] -0.11 -0.04
## [53,] -0.13 0.02
## [54,] -0.09 0.00
## [55,] -0.09 -0.08
## [56,] -0.06 0.03
## [57,] 0.03 0.04
## [58,] 0.17 0.00
## [59,] 0.29 0.01
## [60,] 0.32 0.03
# Fit ARIMA(2,1,0) to chicken - not so good
# (no seasonal terms, despite the correlation at lags 12, 24, ... in
# the ACF table above)
astsa::sarima(chicken, p=2, d=1, q=0)
## initial value 0.001863
## iter 2 value -0.156034
## iter 3 value -0.359181
## iter 4 value -0.424164
## iter 5 value -0.430212
## iter 6 value -0.432744
## iter 7 value -0.432747
## iter 8 value -0.432749
## iter 9 value -0.432749
## iter 10 value -0.432751
## iter 11 value -0.432752
## iter 12 value -0.432752
## iter 13 value -0.432752
## iter 13 value -0.432752
## iter 13 value -0.432752
## final value -0.432752
## converged
## initial value -0.420883
## iter 2 value -0.420934
## iter 3 value -0.420936
## iter 4 value -0.420937
## iter 5 value -0.420937
## iter 6 value -0.420937
## iter 6 value -0.420937
## iter 6 value -0.420937
## final value -0.420937
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 constant
## 0.9494 -0.3069 0.2632
## s.e. 0.0717 0.0718 0.1362
##
## sigma^2 estimated as 0.4286: log likelihood = -178.64, aic = 365.28
##
## $degrees_of_freedom
## [1] 177
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9494 0.0717 13.2339 0.0000
## ar2 -0.3069 0.0718 -4.2723 0.0000
## constant 0.2632 0.1362 1.9328 0.0549
##
## $AIC
## [1] 0.1861622
##
## $AICc
## [1] 0.1985432
##
## $BIC
## [1] -0.7606218
# Fit SARIMA(2,1,0,1,0,0,12) to chicken - that works
# (adds a seasonal AR(1) term at period 12 to the previous model)
astsa::sarima(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12)
## initial value 0.015039
## iter 2 value -0.226398
## iter 3 value -0.412955
## iter 4 value -0.460882
## iter 5 value -0.470787
## iter 6 value -0.471082
## iter 7 value -0.471088
## iter 8 value -0.471090
## iter 9 value -0.471092
## iter 10 value -0.471095
## iter 11 value -0.471095
## iter 12 value -0.471096
## iter 13 value -0.471096
## iter 14 value -0.471096
## iter 15 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## final value -0.471097
## converged
## initial value -0.473585
## iter 2 value -0.473664
## iter 3 value -0.473721
## iter 4 value -0.473823
## iter 5 value -0.473871
## iter 6 value -0.473885
## iter 7 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## final value -0.473886
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 sar1 constant
## 0.9154 -0.2494 0.3237 0.2353
## s.e. 0.0733 0.0739 0.0715 0.1973
##
## sigma^2 estimated as 0.3828: log likelihood = -169.16, aic = 348.33
##
## $degrees_of_freedom
## [1] 176
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9154 0.0733 12.4955 0.0000
## ar2 -0.2494 0.0739 -3.3728 0.0009
## sar1 0.3237 0.0715 4.5238 0.0000
## constant 0.2353 0.1973 1.1923 0.2347
##
## $AIC
## [1] 0.0842377
##
## $AICc
## [1] 0.09726452
##
## $BIC
## [1] -0.8448077
# Load the monthly birth series from the astsa package
data(birth, package="astsa")
str(birth)
## Time-Series [1:373] from 1948 to 1979: 295 286 300 278 272 268 308 321 313 308 ...
# Plot P/ACF to lag 60 of differenced data
d_birth <- diff(birth)
astsa::acf2(d_birth, max.lag=60)
## ACF PACF
## [1,] -0.32 -0.32
## [2,] 0.16 0.06
## [3,] -0.08 -0.01
## [4,] -0.19 -0.25
## [5,] 0.09 -0.03
## [6,] -0.28 -0.26
## [7,] 0.06 -0.17
## [8,] -0.19 -0.29
## [9,] -0.05 -0.35
## [10,] 0.17 -0.16
## [11,] -0.26 -0.59
## [12,] 0.82 0.57
## [13,] -0.28 0.13
## [14,] 0.17 0.11
## [15,] -0.07 0.13
## [16,] -0.18 0.09
## [17,] 0.08 0.00
## [18,] -0.28 0.00
## [19,] 0.07 0.05
## [20,] -0.18 0.04
## [21,] -0.05 -0.07
## [22,] 0.16 -0.10
## [23,] -0.24 -0.20
## [24,] 0.78 0.19
## [25,] -0.27 0.01
## [26,] 0.19 0.05
## [27,] -0.08 0.07
## [28,] -0.17 0.07
## [29,] 0.07 -0.02
## [30,] -0.29 -0.06
## [31,] 0.07 -0.02
## [32,] -0.15 0.09
## [33,] -0.04 0.03
## [34,] 0.14 -0.06
## [35,] -0.24 -0.16
## [36,] 0.75 0.03
## [37,] -0.23 0.08
## [38,] 0.16 -0.10
## [39,] -0.08 -0.03
## [40,] -0.15 0.07
## [41,] 0.05 -0.04
## [42,] -0.25 0.06
## [43,] 0.06 0.04
## [44,] -0.18 -0.07
## [45,] -0.03 -0.06
## [46,] 0.15 0.02
## [47,] -0.22 -0.04
## [48,] 0.72 0.10
## [49,] -0.24 0.01
## [50,] 0.16 0.00
## [51,] -0.08 -0.03
## [52,] -0.13 0.04
## [53,] 0.05 0.03
## [54,] -0.26 0.00
## [55,] 0.05 -0.01
## [56,] -0.17 0.01
## [57,] -0.02 0.03
## [58,] 0.15 0.04
## [59,] -0.23 -0.09
## [60,] 0.70 0.04
# Plot P/ACF to lag 60 of seasonal differenced data
# (seasonal difference at lag 12 of the already first-differenced series)
dd_birth <- diff(d_birth, lag = 12)
astsa::acf2(dd_birth, max.lag=60)
## ACF PACF
## [1,] -0.30 -0.30
## [2,] -0.09 -0.20
## [3,] -0.09 -0.21
## [4,] 0.00 -0.14
## [5,] 0.07 -0.03
## [6,] 0.03 0.02
## [7,] -0.07 -0.06
## [8,] -0.04 -0.08
## [9,] 0.11 0.06
## [10,] 0.04 0.08
## [11,] 0.13 0.23
## [12,] -0.43 -0.32
## [13,] 0.14 -0.06
## [14,] -0.01 -0.13
## [15,] 0.03 -0.13
## [16,] 0.01 -0.11
## [17,] 0.02 0.02
## [18,] 0.00 0.06
## [19,] 0.03 0.04
## [20,] -0.07 -0.10
## [21,] -0.01 0.02
## [22,] 0.00 0.00
## [23,] 0.06 0.17
## [24,] -0.01 -0.13
## [25,] -0.12 -0.14
## [26,] 0.17 0.07
## [27,] -0.04 -0.04
## [28,] 0.03 -0.02
## [29,] -0.05 0.02
## [30,] -0.09 -0.06
## [31,] -0.01 -0.07
## [32,] 0.19 0.05
## [33,] -0.03 0.07
## [34,] -0.09 -0.06
## [35,] -0.02 0.05
## [36,] -0.04 -0.16
## [37,] 0.17 -0.01
## [38,] -0.14 -0.04
## [39,] 0.03 -0.01
## [40,] -0.05 -0.03
## [41,] 0.03 -0.01
## [42,] 0.10 0.01
## [43,] 0.00 0.00
## [44,] -0.10 0.03
## [45,] -0.03 -0.02
## [46,] 0.06 -0.07
## [47,] 0.02 0.05
## [48,] 0.01 -0.11
## [49,] -0.01 0.05
## [50,] 0.06 0.06
## [51,] -0.08 -0.03
## [52,] 0.03 -0.03
## [53,] 0.01 0.04
## [54,] -0.02 0.02
## [55,] -0.01 -0.04
## [56,] 0.00 -0.01
## [57,] -0.07 -0.13
## [58,] 0.17 0.07
## [59,] -0.04 0.07
## [60,] -0.01 -0.05
# Fit SARIMA(0,1,1)x(0,1,1)_12. What happens?
# (MA(1) plus seasonal MA(1), with one regular and one seasonal difference)
astsa::sarima(birth, p=0, d=1, q=1, P=0, D=1, Q=1, S=12)
## initial value 2.219164
## iter 2 value 2.013310
## iter 3 value 1.988107
## iter 4 value 1.980026
## iter 5 value 1.967594
## iter 6 value 1.965384
## iter 7 value 1.965049
## iter 8 value 1.964993
## iter 9 value 1.964992
## iter 9 value 1.964992
## iter 9 value 1.964992
## final value 1.964992
## converged
## initial value 1.951264
## iter 2 value 1.945867
## iter 3 value 1.945729
## iter 4 value 1.945723
## iter 5 value 1.945723
## iter 5 value 1.945723
## iter 5 value 1.945723
## final value 1.945723
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 sma1
## -0.4734 -0.7861
## s.e. 0.0598 0.0451
##
## sigma^2 estimated as 47.4: log likelihood = -1211.28, aic = 2428.56
##
## $degrees_of_freedom
## [1] 371
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.4734 0.0598 -7.9097 0
## sma1 -0.7861 0.0451 -17.4227 0
##
## $AIC
## [1] 4.869388
##
## $AICc
## [1] 4.874924
##
## $BIC
## [1] 3.890415
# Add AR term and conclude
# (SARIMA(1,1,1)x(0,1,1)_12: adds a non-seasonal AR(1) to the model above)
astsa::sarima(birth, p=1, d=1, q=1, P=0, D=1, Q=1, S=12)
## initial value 2.218186
## iter 2 value 2.032584
## iter 3 value 1.982464
## iter 4 value 1.975643
## iter 5 value 1.971721
## iter 6 value 1.967284
## iter 7 value 1.963840
## iter 8 value 1.961106
## iter 9 value 1.960849
## iter 10 value 1.960692
## iter 11 value 1.960683
## iter 12 value 1.960675
## iter 13 value 1.960672
## iter 13 value 1.960672
## iter 13 value 1.960672
## final value 1.960672
## converged
## initial value 1.940459
## iter 2 value 1.934425
## iter 3 value 1.932752
## iter 4 value 1.931750
## iter 5 value 1.931074
## iter 6 value 1.930882
## iter 7 value 1.930860
## iter 8 value 1.930859
## iter 8 value 1.930859
## final value 1.930859
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 sma1
## 0.3038 -0.7006 -0.8000
## s.e. 0.0865 0.0604 0.0441
##
## sigma^2 estimated as 45.91: log likelihood = -1205.93, aic = 2419.85
##
## $degrees_of_freedom
## [1] 370
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3038 0.0865 3.5104 5e-04
## ma1 -0.7006 0.0604 -11.5984 0e+00
## sma1 -0.8000 0.0441 -18.1302 0e+00
##
## $AIC
## [1] 4.842869
##
## $AICc
## [1] 4.848523
##
## $BIC
## [1] 3.87441
# Reload unemp so this section stands alone
data(unemp, package="astsa")
str(unemp)
## Time-Series [1:372] from 1948 to 1979: 235 281 265 241 201 ...
# Fit your previous model to unemp and check the diagnostics
# (SARIMA(2,1,0)x(0,1,1)_12, as fit earlier)
astsa::sarima(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12)
## initial value 3.340809
## iter 2 value 3.105512
## iter 3 value 3.086631
## iter 4 value 3.079778
## iter 5 value 3.069447
## iter 6 value 3.067659
## iter 7 value 3.067426
## iter 8 value 3.067418
## iter 8 value 3.067418
## final value 3.067418
## converged
## initial value 3.065481
## iter 2 value 3.065478
## iter 3 value 3.065477
## iter 3 value 3.065477
## iter 3 value 3.065477
## final value 3.065477
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 sma1
## 0.1351 0.2464 -0.6953
## s.e. 0.0513 0.0515 0.0381
##
## sigma^2 estimated as 449.6: log likelihood = -1609.91, aic = 3227.81
##
## $degrees_of_freedom
## [1] 369
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.1351 0.0513 2.6326 0.0088
## ar2 0.2464 0.0515 4.7795 0.0000
## sma1 -0.6953 0.0381 -18.2362 0.0000
##
## $AIC
## [1] 7.12457
##
## $AICc
## [1] 7.130239
##
## $BIC
## [1] 6.156174
# Forecast the data 3 years (36 months) into the future
astsa::sarima.for(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12, n.ahead=36)
## $pred
## Jan Feb Mar Apr May Jun Jul
## 1979 676.4664 685.1172 653.2388 585.6939 553.8813 664.4072 647.0657
## 1980 683.3045 687.7649 654.8658 586.1507 553.9285 664.1108 646.6220
## 1981 682.6406 687.0977 654.1968 585.4806 553.2579 663.4398 645.9508
## Aug Sep Oct Nov Dec
## 1979 611.0828 594.6414 569.3997 587.5801 581.1833
## 1980 610.5345 594.0427 568.7684 586.9320 580.5249
## 1981 609.8632 593.3713 568.0970 586.2606 579.8535
##
## $se
## Jan Feb Mar Apr May Jun Jul
## 1979 21.20465 32.07710 43.70167 53.66329 62.85364 71.12881 78.73590
## 1980 116.99599 124.17344 131.51281 138.60466 145.49706 152.12863 158.52302
## 1981 194.25167 201.10648 208.17066 215.11503 221.96039 228.64285 235.16874
## Aug Sep Oct Nov Dec
## 1979 85.75096 92.28663 98.41329 104.19488 109.67935
## 1980 164.68623 170.63839 176.39520 181.97333 187.38718
## 1981 241.53258 247.74268 253.80549 259.72970 265.52323
# Reload chicken so this section stands alone
data(chicken, package="astsa")
str(chicken)
## Time-Series [1:180] from 2002 to 2016: 65.6 66.5 65.7 64.3 63.2 ...
# Fit the chicken model again and check diagnostics
astsa::sarima(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12)
## initial value 0.015039
## iter 2 value -0.226398
## iter 3 value -0.412955
## iter 4 value -0.460882
## iter 5 value -0.470787
## iter 6 value -0.471082
## iter 7 value -0.471088
## iter 8 value -0.471090
## iter 9 value -0.471092
## iter 10 value -0.471095
## iter 11 value -0.471095
## iter 12 value -0.471096
## iter 13 value -0.471096
## iter 14 value -0.471096
## iter 15 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## final value -0.471097
## converged
## initial value -0.473585
## iter 2 value -0.473664
## iter 3 value -0.473721
## iter 4 value -0.473823
## iter 5 value -0.473871
## iter 6 value -0.473885
## iter 7 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## final value -0.473886
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 sar1 constant
## 0.9154 -0.2494 0.3237 0.2353
## s.e. 0.0733 0.0739 0.0715 0.1973
##
## sigma^2 estimated as 0.3828: log likelihood = -169.16, aic = 348.33
##
## $degrees_of_freedom
## [1] 176
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9154 0.0733 12.4955 0.0000
## ar2 -0.2494 0.0739 -3.3728 0.0009
## sar1 0.3237 0.0715 4.5238 0.0000
## constant 0.2353 0.1973 1.1923 0.2347
##
## $AIC
## [1] 0.0842377
##
## $AICc
## [1] 0.09726452
##
## $BIC
## [1] -0.8448077
# Forecast the chicken data 5 years (60 months) into the future
astsa::sarima.for(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12, n.ahead=60)
## $pred
## Jan Feb Mar Apr May Jun Jul
## 2016
## 2017 110.5358 110.5612 110.5480 110.7055 111.0047 111.1189 111.1552
## 2018 111.8108 111.9782 112.1330 112.3431 112.5991 112.7952 112.9661
## 2019 114.1331 114.3464 114.5556 114.7827 115.0247 115.2473 115.4617
## 2020 116.7942 117.0224 117.2492 117.4819 117.7193 117.9505 118.1790
## 2021 119.5651 119.7980 120.0306 120.2650 120.5010 120.7350 120.9681
## Aug Sep Oct Nov Dec
## 2016 111.0907 110.8740 110.6853 110.5045 110.5527
## 2017 111.1948 111.2838 111.3819 111.4825 111.6572
## 2018 113.1380 113.3260 113.5168 113.7085 113.9242
## 2019 115.6765 115.8965 116.1174 116.3386 116.5675
## 2020 118.4077 118.6380 118.8686 119.0993 119.3326
## 2021
##
## $se
## Jan Feb Mar Apr May Jun
## 2016
## 2017 3.7414959 4.1793190 4.5747009 4.9373266 5.2742129 5.5903499
## 2018 8.2010253 8.5605811 8.9054714 9.2372195 9.5572539 9.8667955
## 2019 12.0038164 12.2921541 12.5738417 12.8492868 13.1188976 13.3830477
## 2020 15.1557253 15.3959082 15.6323906 15.8653300 16.0948844 16.3212022
## 2021 17.8397890 18.0473081 18.2524651 18.4553364 18.6559977 18.8545213
## Jul Aug Sep Oct Nov Dec
## 2016 0.6187194 1.3368594 2.0462419 2.6867986 3.2486625
## 2017 5.8893133 6.2367345 6.6253573 7.0309771 7.4344077 7.8255932
## 2018 10.1668604 10.4736807 10.7857727 11.0980056 11.4063211 11.7085266
## 2019 13.6420693 13.9002670 14.1573839 14.4122197 14.6638269 14.9117124
## 2020 16.5444204 16.7657634 16.9852163 17.2025022 17.4174076 17.6298379
## 2021 19.0509752
Chapter 1 - Introduction to Bayesian Thinking
Discrete probability distributions - two schools of thought, frequentist and Bayesian:
Bayes’ rule - Presbyterian minister Thomas Bayes was a mathematician in his spare time:
Sequential Bayes - the posterior after the first trial becomes the prior for sequential trials:
Example code includes:
# Define a spinner with five regions: regions
regions <- c(1, 1, 1, 1, 1)
# Plot the spinner
TeachBayes::spinner_plot(regions)
# Show the probability distribution
TeachBayes::spinner_probs(regions)
## Region Prob
## 1 1 0.2
## 2 2 0.2
## 3 3 0.2
## 4 4 0.2
## 5 5 0.2
# Define new spinner: regions
# (region widths 2, 2, 4 give probabilities 0.25, 0.25, 0.5)
regions <- c(2, 2, 4)
# Simulate 1000 spins: spins
spins <- TeachBayes::spinner_data(regions, nsim=1000)
# Graph the spin data using bar_plot()
TeachBayes::bar_plot(spins)
# Construct frequency table of spins
table(spins)
## spins
## 1 2 3
## 241 254 505
# Find fraction of spins equal to 2
mean(spins == 2)
## [1] 0.254
# Find mean spin value
mean(spins)
## [1] 2.264
# Create the vector of models: Model
Model <- c("Spinner A", "Spinner B")
# Define the vector of prior probabilities: Prior
Prior <- c(0.5, 0.5)
# Define the vector of likelihoods: Likelihood
Likelihood <- c(1/2, 1/6)
# Make a data frame with variables Model, Prior, Likelihood: bayes_df
bayes_df <- data.frame(Model, Prior, Likelihood, stringsAsFactors=FALSE)
str(bayes_df)
## 'data.frame': 2 obs. of 3 variables:
## $ Model : chr "Spinner A" "Spinner B"
## $ Prior : num 0.5 0.5
## $ Likelihood: num 0.5 0.167
# Compute the posterior probabilities
TeachBayes::bayesian_crank(bayes_df)
## Model Prior Likelihood Product Posterior
## 1 Spinner A 0.5 0.5000000 0.25000000 0.75
## 2 Spinner B 0.5 0.1666667 0.08333333 0.25
TeachBayes::prior_post_plot( TeachBayes::bayesian_crank(bayes_df) )
# Define the vector of models: Model
Model <- c("Spinner A", "Spinner B")
# Define the vector of prior probabilities: Prior
# (sequential Bayes: this prior is the posterior from the analysis above)
Prior <- c(0.75, 0.25)
# Define the vector of likelihoods: Likelihood
Likelihood <- c(1/2, 1/6)
# Make a data frame with variables Model, Prior, Likelihood: bayes_df
bayes_df <- data.frame(Model, Prior, Likelihood, stringsAsFactors=FALSE)
str(bayes_df)
## 'data.frame': 2 obs. of 3 variables:
## $ Model : chr "Spinner A" "Spinner B"
## $ Prior : num 0.75 0.25
## $ Likelihood: num 0.5 0.167
# Compute the posterior probabilities
TeachBayes::bayesian_crank(bayes_df)
## Model Prior Likelihood Product Posterior
## 1 Spinner A 0.75 0.5000000 0.37500000 0.9
## 2 Spinner B 0.25 0.1666667 0.04166667 0.1
Chapter 2 - Binomial Probability
Bayes with discrete models - example of “percentage, p, of people who prefer discrete time period X for activity Y”:
Bayes with continuous priors - continuing with the example of “percentage, p, of people who prefer discrete time period X for activity Y”:
Updating the beta prior - the product of the beta-curve prior and the binomial likelihoods is again a beta-curve:
Bayesian inference - all inferences are based on various summarizations of the posterior beta-curve:
Posterior simulation - can simulate from the posterior probability using rbeta():
Example code includes:
# Define the values of the proportion: P
P <- c(0.5, 0.6, 0.7, 0.8, 0.9)
# Define Madison's prior: Prior
Prior <- c(0.3, 0.3, 0.2, 0.1, 0.1)
# Compute the likelihoods: Likelihood
# (probability of observing 16 successes in 20 trials under each value of P)
Likelihood <- dbinom(16, size=20, prob=P)
# Create Bayes data frame: bayes_df
bayes_df <- data.frame(P, Prior, Likelihood)
str(bayes_df)
## 'data.frame': 5 obs. of 3 variables:
## $ P : num 0.5 0.6 0.7 0.8 0.9
## $ Prior : num 0.3 0.3 0.2 0.1 0.1
## $ Likelihood: num 0.00462 0.03499 0.13042 0.2182 0.08978
# Compute the posterior probabilities: bayes_df
bayes_df <- TeachBayes::bayesian_crank(bayes_df)
str(bayes_df)
## 'data.frame': 5 obs. of 5 variables:
## $ P : num 0.5 0.6 0.7 0.8 0.9
## $ Prior : num 0.3 0.3 0.2 0.1 0.1
## $ Likelihood: num 0.00462 0.03499 0.13042 0.2182 0.08978
## $ Product : num 0.00139 0.0105 0.02608 0.02182 0.00898
## $ Posterior : num 0.0202 0.1527 0.3793 0.3173 0.1306
# Graphically compare the prior and posterior
TeachBayes::prior_post_plot(bayes_df)
# Summaries of a beta curve for P (shape parameters 8.13 and 3.67 come
# from the course exercise — presumably from quantile matching; see
# beta.select below)
# Find the probability that P is smaller than 0.85
pbeta(0.85, 8.13, 3.67)
## [1] 0.9000721
# Find the probability that P is larger than 0.85
pbeta(0.85, 8.13, 3.67, lower.tail=FALSE)
## [1] 0.09992792
# Find the 0.75 quantile of P
qbeta(0.75, 8.13, 3.67)
## [1] 0.785503
# Specify that the 0.25 quantile of P is equal to 0.7: quantile1
quantile1 <- list(p=0.25, x=0.7)
# Specify that the 0.75 quantile of P is equal to 0.85: quantile2
quantile2 <- list(p=0.75, x=0.85)
# Find the beta shape parameters matching the two quantiles: ab
ab <- LearnBayes::beta.select(quantile1, quantile2)
# Plot the beta curve using the beta_draw() function
TeachBayes::beta_draw(ab)
# Harry's shape parameters for his prior: ab
ab <- c(3, 3)
# Vector of successes and failures: sf
sf <- c(16, 4)
# Harry's shape parameters for his posterior: ab_new
# (beta posterior shapes = prior shapes + (successes, failures))
ab_new <- ab + sf
# Graph Harry's posterior
TeachBayes::beta_draw(ab_new)
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Compute probability that P is smaller than 0.70
pbeta(0.7, ab[1], ab[2])
## [1] 0.3406549
# Show the area that is computed
TeachBayes::beta_area(0, 0.7, ab)
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Compute 90 percent interval
qbeta(c(0.05, 0.95), ab[1], ab[2])
## [1] 0.5804800 0.8605247
# Show the interval that is computed
TeachBayes::beta_interval(0.9, ab)
# Approximate confidence interval for a binomial proportion using the
# "plus four" adjustment: add two successes and two failures, then apply
# the usual Wald normal approximation to the adjusted counts.
#
#   y          - observed number of successes
#   n          - number of trials
#   conf.level - two-sided confidence level (default 0.95)
#
# Returns a length-2 numeric vector c(lower, upper).
classical_binom_ci <- function(y, n, conf.level = 0.95) {
  stopifnot(is.numeric(y), is.numeric(n), n > 0, y >= 0, y <= n,
            conf.level > 0, conf.level < 1)
  s <- y + 2                 # adjusted number of successes
  n_new <- n + 4             # adjusted sample size (2 successes + 2 failures)
  phat <- s / n_new          # adjusted point estimate
  se <- sqrt(phat * (1 - phat) / n_new)
  z <- qnorm(1 - (1 - conf.level) / 2)
  c(phat - z * se, phat + z * se)
}
# Define the number of successes and sample size: y, n
y <- 16
n <- 20
# Construct a 90 percent confidence interval
# (classical "plus four" interval via classical_binom_ci defined above)
classical_binom_ci(y=y, n=n, conf.level=0.9)
## [1] 0.6046141 0.8953859
# Define the shape parameters for a uniform prior: ab
ab <- c(1, 1)
# Find the shape parameters of the posterior: ab_new
ab_new <- ab + c(y, n-y)
# Find a 90% Bayesian probability interval
# (numerically close to the classical interval above)
TeachBayes::beta_interval(0.9, ab_new)
qbeta(c(0.05, 0.95), ab_new[1], ab_new[2])
## [1] 0.6155919 0.9011565
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Simulate 1000 draws from the beta posterior: p_sim
p_sim <- rbeta(1000, ab[1], ab[2])
# Construct a histogram of the simulated values
hist(p_sim, freq=FALSE)
# Compute the probability that P is larger than 0.7
mean(p_sim > 0.7)
## [1] 0.658
# Find a 90% probability interval
quantile(p_sim, c(0.05, 0.95))
## 5% 95%
## 0.5904463 0.8581961
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Simulate 1000 draws from the beta posterior: p_sim
p_sim <- rbeta(1000, ab[1], ab[2])
# Compute the odds-ratio: or_sim
# (odds p/(1-p) for each simulated proportion)
or_sim <- p_sim / (1 - p_sim)
# Construct a histogram of the simulated values of or_sim
hist(or_sim, freq=FALSE)
# Find the probability the odds ratio is greater than 2
mean(or_sim > 2)
## [1] 0.778
# Find a 90% probability interval for the odds ratio
quantile(or_sim, c(0.05, 0.95))
## 5% 95%
## 1.390640 6.141282
Chapter 3 - Normal Mean
Normal sampling model - Roger Federer “serving efficiency” examples:
Bayes with a continuous prior - same example assuming normal distribution with mean M and sd s:
Updating the normal prior - suppose a starting prior for 18 +/- 1.56 (Mo +/- So):
Simulation - can take the Posterior M and S and run simulations using rnorm:
Example code includes:
# Place possible values of M in a vector: Model
Model <- seq(250, 290, by = 10)
# Construct a uniform probability vector: Prior1
Prior1 <- rep(0.2, 5)
# Graph the prior using function prob_plot()
TeachBayes::prob_plot(data.frame(Model, Prior1))
# Construct a different probability distribution: Prior2
Prior2 <- c(0.3, 0.3, 0.2, 0.1, 0.1)
# Graph the prior using function prob_plot()
TeachBayes::prob_plot(data.frame(Model, Prior2))
# Define models and prior: M, Prior
M <- seq(250, 290, by = 10)
Prior <- rep(.2, 5)
# Collect observations
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Compute ybar and standard error
# (sigma treated as known; se = sigma / sqrt(n))
ybar <- mean(times); n <- 10
sigma <- 20; se <- sigma / sqrt(n)
# Compute likelihoods using dnorm(): Likelihood
# (density of the observed sample mean under each candidate value of M)
Likelihood <- dnorm(ybar, mean=M, sd=se)
# Collect the vectors M, Prior, Likelihood in a data frame: bayes_df
bayes_df <- data.frame(M, Prior, Likelihood)
# Use bayesian_crank to compute the posterior probabilities: bayes_df
bayes_df <- TeachBayes::bayesian_crank(bayes_df)
# Use prior_post_plot() to graph the prior and posterior probabilities
TeachBayes::prior_post_plot(bayes_df)
# Specify the 0.02 quantile of M: quantile1
quantile1 <- list(p=0.02, x=240)
# Specify the 0.60 quantile of M: quantile2
quantile2 <- list(p=0.6, x=280)
# Find the normal parameters that match the two quantiles
normal_par <- LearnBayes::normal.select(quantile1, quantile2)
# Plot the normal curve using the normal_draw() function
TeachBayes::normal_draw(normal_par)
# Collect observations
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Compute ybar and standard error
ybar <- mean(times)
sigma <- 20; se <- sigma / sqrt(10)
# Define mean and standard error: Data
Data <- c(ybar, se)
# Define mean and standard deviation of prior: Prior
Prior <- c(260, 10)
# Use normal_update() function: Posterior
# (combines the normal prior with the normal likelihood summary)
Posterior <- TeachBayes::normal_update(Prior, Data)
# Construct plot of prior and posterior
TeachBayes::many_normal_plots(list(Prior, Posterior))
# Define mean and standard error: Data
Data <- c(275.9, 6.32)
# Compute 90% confidence interval: C_Interval
C_Interval <- Data[1] + c(-1, 1) * 1.645 * Data[2]
# Find the length of the confidence interval
diff(C_Interval)
## [1] 20.7928
# Define mean and standard deviation of posterior: Posterior
Posterior <- c(271.35, 5.34)
# Display a 90% probability interval
TeachBayes::normal_interval(prob=0.90, Posterior)
# Compute the 90% probability interval: B_Interval
B_Interval <- qnorm(p=c(0.05, 0.95), mean=271.35, sd=5.34)
# Compute the length of the Bayesian interval
# (shorter than the classical interval above)
diff(B_Interval)
## [1] 17.56704
# Simulate 1000 values from the posterior curve: M_sim
M_sim <- rnorm(1000, 270.5, 5.8)
# Compute the posterior standard deviation
sd(M_sim)
## [1] 5.674759
# Compute the probability that M is smaller than 260
mean(M_sim < 260)
## [1] 0.029
# Find a 70 percent probability interval for M
quantile(M_sim, c(0.15, 0.85))
## 15% 85%
## 265.0575 276.5815
# Simulate 1000 draws from John's posterior density: M_sim
M_sim <- rnorm(1000, 270.5, 5.8)
# Simulate 1000 draws from the predictive density: y_sim
# (each draw uses a simulated M as its mean, so the predictive spread
# reflects both parameter and sampling uncertainty)
y_sim <- rnorm(1000, M_sim, 20)
# Compute the probability I score less than 250
mean(y_sim < 250)
## [1] 0.157
# Find a 90 percent prediction interval for my score
quantile(y_sim, c(0.05, 0.95))
## 5% 95%
## 239.5252 302.1135
Chapter 4 - Bayesian Comparisons
Comparing two proportions - multiple parameters rather than just a single proportion or a single mean:
Proportions with continuous priors - continuing with the exercise examples with pW and pM:
Normal model inference - modeling when both the mean M and the standard deviation S are unknown:
Bayesian regression - example of looking at “how much slower does Rafa serve than Roger”?
Example code includes:
# --- Comparing two proportions on a discrete 5x5 grid (TeachBayes) ---
# Define a uniform prior on all 25 pairs: prior
prior <- TeachBayes::testing_prior(0.1, 0.9, 5, uniform=TRUE)
# Display the prior matrix
prior
## 0.1 0.3 0.5 0.7 0.9
## 0.1 0.04 0.04 0.04 0.04 0.04
## 0.3 0.04 0.04 0.04 0.04 0.04
## 0.5 0.04 0.04 0.04 0.04 0.04
## 0.7 0.04 0.04 0.04 0.04 0.04
## 0.9 0.04 0.04 0.04 0.04 0.04
# Graph the prior
TeachBayes::draw_two_p(prior)
# Find the probability distribution of pN - pS: d_NS
d_NS <- TeachBayes::two_p_summarize(prior)
# Graph this distribution
TeachBayes::prob_plot(d_NS)
# Define a uniform prior on all 25 pairs: prior
prior <- TeachBayes::testing_prior(0.1, 0.9, 5, uniform = TRUE)
# Define the data: s1f1, s2f2
# (each vector is c(successes, failures) for one group)
s1f1 <- c(12, 8)
s2f2 <- c(17, 3)
# Compute the posterior: post
post <- TeachBayes::two_p_update(prior, s1f1, s2f2)
# Graph the posterior
TeachBayes::draw_two_p(post)
# Find the probability distribution of pN - pS: d_NS
d_NS <- TeachBayes::two_p_summarize(post)
# Graph this distribution
TeachBayes::prob_plot(d_NS)
# Simulate 1000 values from the prior on pS: sim_pS
sim_pS <- rbeta(1000, 4.91, 3.38)
# Simulate 1000 values from the prior on pN: sim_pN
sim_pN <- rbeta(1000, 4.91, 3.38)
# For each pair of proportions, compute the difference: d_NS
d_NS <- sim_pN - sim_pS
# Plot a histogram of the values in d_NS
hist(d_NS)
# Find the probability d_NS is positive
# (both priors use identical Beta shapes, so the difference is centered at 0
# and this probability is ~0.5 by construction)
mean(d_NS > 0)
## [1] 0.499
# Find a 90% probability interval for d_NS
quantile(d_NS, c(0.05, 0.95))
## 5% 95%
## -0.3920780 0.3739923
# --- Conjugate beta updating: Beta(a, b) prior + (s, f) data -> Beta(a+s, b+f) ---
# Define the number of successes and number of failures: s1f1, s2f2
s1f1 <- c(12, 8)
s2f2 <- c(17, 3)
# Find the prior beta shape parameters for pS and pN:
# (Beta(1, 1) is the flat/uniform prior)
pS_prior <- c(1, 1)
pN_prior <- c(1, 1)
# Find the posterior beta shape parameters for pS: pS_shape
pS_shape <- pS_prior + s1f1
# Find the posterior beta shape parameters for pN: pN_shape
pN_shape <- pN_prior + s2f2
# Simulate 1000 draws from the posterior: sim_pS, sim_pN
sim_pS <- rbeta(1000, pS_shape[1], pS_shape[2])
sim_pN <- rbeta(1000, pN_shape[1], pN_shape[2])
# Construct a scatterplot of the posterior
plot(sim_pS, sim_pN)
# Simulate 1000 draws from the posterior: sim_pS, sim_pN
# (13 = 1+12, 9 = 1+8 and 18 = 1+17, 4 = 1+3 -- the same posteriors, hard-coded)
sim_pS <- rbeta(1000, 13, 9)
sim_pN <- rbeta(1000, 18, 4)
# For each pair of proportions, compute the ratio: r_NS
r_NS <- sim_pN / sim_pS
# Plot a histogram of the values in r_NS
hist(r_NS)
# Find the probability r_NS is larger than 1
mean(r_NS > 1)
## [1] 0.96
# Find a 80% probability interval for r_NS
quantile(r_NS, c(0.1, 0.9))
## 10% 90%
## 1.094200 1.831035
# --- Normal model with both mean M and sd S unknown, via arm::sim() ---
# Collect reaction times: times
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Fit a normal model: fit
# (an intercept-only lm estimates a single mean and residual sd)
fit <- lm(times ~ 1)
# Simulate 1000 from posterior: sim_fit
sim_fit <- arm::sim(fit, n.sims=1000)
# Extract the simulated values of M and S: M_sim, s_sim
M_sim <- coef(sim_fit)
S_sim <- arm::sigma.hat(sim_fit)
# Construct a scatterplot of simulated values
plot(M_sim, S_sim)
# Collect reaction times: times
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Fit a normal model: fit
fit <- lm(times ~ 1)
# Simulate 1000 from posterior: sim_fit
sim_fit <- arm::sim(fit, n.sims = 1000)
# Extract the simulated values of M and S: M_sim, s_sim
M_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Compute values of the 75th percentile: Q75
# (0.674 is approximately qnorm(0.75), so Q75 is the normal 75th percentile)
Q75 <- M_sim + 0.674 * s_sim
# Construct histogram of the posterior of Q75
hist(Q75)
# Find a 70% probability interval for Q75
quantile(Q75, c(0.15, 0.85))
## 15% 85%
## 284.1902 302.1866
# --- Bayesian regression: comparing two people's reaction times ---
ddTime <- c( 240, 267, 308, 275, 271, 268, 258, 295, 315, 262, 279, 241, 225, 252, 288, 242, 281, 254, 263, 276 )
ddPerson <- rep(c("Jim", "Steven"), each=10)
dd <- data.frame(Person=factor(ddPerson), Time=ddTime)
# Perform a regression fit of Time with Person as a covariate: fit
# (factor levels are alphabetical, so Jim is the baseline and the slope
# is the Steven-minus-Jim difference in mean time)
fit <- lm(Time ~ Person, data = dd)
# Simulate 1000 values from the posterior distribution: sim_fit
sim_fit <- arm::sim(fit, n.sims=1000)
# Extract simulated draws of beta and S: beta_sim, s_sim
beta_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Construct a scatterplot of the posterior distribution of (beta0, beta1)
plot(beta_sim[, 1], beta_sim[, 2])
# Perform a regression fit of Time with Person as a covariate: fit
fit <- lm(Time ~ Person, data = dd)
# Simulate 1000 values from the posterior distribution: sim_fit
sim_fit <- arm::sim(fit, n.sims = 1000)
# Extract simulated draws of beta and S: beta_sim, s_sim
beta_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Compute simulated values of the standardized change: s_delta
# (group difference in units of the residual sd -- an effect size)
s_delta <- beta_sim[,2] / s_sim
# Find 90% interval estimate for s_delta
quantile(s_delta, c(0.05, 0.95))
## 5% 95%
## -1.478500882 -0.006343935
Chapter 1 - What is Machine Learning?
Machine learning is the process of constructing and using algorithms that learn from data:
Classification, Regression, Clustering are three common forms of machine learning problems:
Supervised vs Unsupervised Learning:
Example code includes:
data(iris, package="datasets")
# Reveal number of observations and variables in two different ways
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(iris)
## [1] 150 5
# Show first and last observations in the iris data set
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Summarize the iris data set
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
data(Wage, package="ISLR")
# Build Linear Model: lm_wage (coded already)
lm_wage <- lm(wage ~ age, data = Wage)
# Define data.frame: unseen (coded already)
unseen <- data.frame(age = 60)
# Predict the wage for a 60-year old worker
predict(lm_wage, unseen)
## 1
## 124.1413
# Toy classification data: 13 emails with one feature (average length of
# capital-letter sequences) and a 0/1 spam label.
emails <- data.frame(
avg_capital_seq=c( 1, 2.11, 4.12, 1.86, 2.97, 1.69, 5.891, 3.17, 1.23, 2.44, 3.56, 3.25, 1.33 ),
spam=as.integer(c( 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1 ))
)
str(emails)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : int 0 0 1 0 1 0 1 0 0 1 ...
# Show the dimensions of emails
dim(emails)
## [1] 13 2
# Inspect definition of spam_classifier()
spam_classifier <- function(x){
  # Hand-tuned banded rule mapping each avg_capital_seq value to 0 (ham)
  # or 1 (spam). Bands (identical boundaries to the original assignments):
  #   (4, Inf) -> 1, [3, 4] -> 0, [2.2, 3) -> 1,
  #   [1.4, 2.2) -> 0, (1.25, 1.4) -> 1, (-Inf, 1.25] -> 0
  ifelse(x > 4, 1,
    ifelse(x >= 3, 0,
      ifelse(x >= 2.2, 1,
        ifelse(x >= 1.4, 0,
          ifelse(x > 1.25, 1, 0)))))
}
# Apply the classifier to the avg_capital_seq column: spam_pred
spam_pred <- spam_classifier(emails$avg_capital_seq)
# Compare spam_pred to emails$spam. Use ==
# (all TRUE because the bands were hand-tuned to this exact sample --
# a deliberate overfitting demonstration, not real performance)
spam_pred == emails$spam
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Daily LinkedIn profile views for three weeks: linkedin
linkedin <- c(5, 7, 4, 9, 11, 10, 14, 17, 13, 11, 18, 17, 21, 21, 24, 23, 28, 35, 21, 27, 23)
# Create the days vector; seq_along() is the safe idiom here
# (1:length(x) would produce c(1, 0) on zero-length input)
days <- seq_along(linkedin)
# Fit a linear model called on the linkedin views per day: linkedin_lm
linkedin_lm <- lm(linkedin ~ days)
# Predict the number of views for the next three days: linkedin_pred
future_days <- data.frame(days = 22:24)
linkedin_pred <- predict(linkedin_lm, future_days)
# Plot historical data and predictions
plot(linkedin ~ days, xlim = c(1, 24))
points(22:24, linkedin_pred, col = "green")
# Chop up iris in my_iris and species
# (iris[-5] drops the 5th column, Species, leaving only numeric features)
my_iris <- iris[-5]
species <- iris$Species
# Perform k-means clustering on my_iris: kmeans_iris
# NOTE(review): kmeans uses random starts; cluster ids are arbitrary and the
# table below will vary from run to run without set.seed()
kmeans_iris <- kmeans(my_iris, 3)
# Compare the actual Species to the clustering using table()
table(kmeans_iris$cluster, species)
## species
## setosa versicolor virginica
## 1 0 48 14
## 2 0 2 36
## 3 50 0 0
# Plot Petal.Width against Petal.Length, coloring by cluster
plot(Petal.Length ~ Petal.Width, data = my_iris, col = kmeans_iris$cluster)
# Take a look at the iris dataset
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# A decision tree model has been built for you
# (method = "class" requests a classification tree; the response is a factor)
tree <- rpart::rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
data = iris, method = "class")
# A dataframe containing unseen observations
# (column names must match the training predictors exactly)
unseen <- data.frame(Sepal.Length = c(5.3, 7.2),
Sepal.Width = c(2.9, 3.9),
Petal.Length = c(1.7, 5.4),
Petal.Width = c(0.8, 2.3)
)
# Predict the label of the unseen observations. Print out the result.
predict(tree, unseen, type="class")
## 1 2
## setosa virginica
## Levels: setosa versicolor virginica
## Levels: setosa versicolor virginica
# Unsupervised example: cluster cars on weight and horsepower only.
data(mtcars, package="datasets")
cars <- mtcars[,c("wt", "hp")]
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
# Explore the cars dataset
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
summary(cars)
## wt hp
## Min. :1.513 Min. : 52.0
## 1st Qu.:2.581 1st Qu.: 96.5
## Median :3.325 Median :123.0
## Mean :3.217 Mean :146.7
## 3rd Qu.:3.610 3rd Qu.:180.0
## Max. :5.424 Max. :335.0
# Group the dataset into two clusters: km_cars
# NOTE(review): the features are unscaled, so hp (range ~52-335) dominates
# the Euclidean distance over wt (range ~1.5-5.4)
km_cars <- kmeans(cars, 2)
# Print out the contents of each cluster
km_cars$cluster
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 2 2 2
## Hornet 4 Drive Hornet Sportabout Valiant
## 2 1 2
## Duster 360 Merc 240D Merc 230
## 1 2 2
## Merc 280 Merc 280C Merc 450SE
## 2 2 1
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 1 1 1
## Lincoln Continental Chrysler Imperial Fiat 128
## 1 1 2
## Honda Civic Toyota Corolla Toyota Corona
## 2 2 2
## Dodge Challenger AMC Javelin Camaro Z28
## 2 2 1
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 1 2 2
## Lotus Europa Ford Pantera L Ferrari Dino
## 2 1 1
## Maserati Bora Volvo 142E
## 1 2
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
# Add code: color the points in the plot based on the clusters
plot(cars, col=km_cars$cluster)
# Print out the cluster centroids
km_cars$centers
## wt hp
## 1 3.984923 215.69231
## 2 2.692000 99.47368
# Replace the ___ part: add the centroids to the plot
# (pch = 22 draws filled squares; bg sets their fill colors to match clusters)
points(km_cars$centers, pch = 22, bg = c(1, 2), cex = 2)
Chapter 2 - Performance Measures
Measuring model performance or error - is the model good?
Training set and test set - predictive power is about the ability to make predictions on unseen data:
Bias and variance are the main error sources for a predictive model:
Example code includes:
# --- Confusion matrix, accuracy, precision, recall on the Titanic data ---
# NOTE(review): the %>% / select / mutate pipeline assumes dplyr (or magrittr)
# is attached earlier in the document -- confirm before running standalone.
data(titanic_train, package="titanic")
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex)) %>%
na.omit()
# Have a look at the structure of titanic
str(titanic)
## 'data.frame': 714 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 1 1 2 2 2 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# A decision tree classification model is built on the data
tree <- rpart::rpart(Survived ~ ., data = titanic, method = "class")
# Use the predict() method to make predictions, assign to pred
# (predicting on the training data itself -> resubstitution accuracy below)
pred <- predict(tree, titanic, type="class")
# Use the table() method to make the confusion matrix
# (levels=c(1, 0) above puts the positive class "1" in row/column 1,
# so conf[1, 1] is the true-positive cell)
(conf <- table(titanic$Survived, pred))
## pred
## 1 0
## 1 212 78
## 0 53 371
# Assign TP, FN, FP and TN using conf
TP <- conf[1, 1] # this will be 212
FN <- conf[1, 2] # this will be 78
FP <- conf[2, 1] # fill in
TN <- conf[2, 2] # fill in
# Calculate and print the accuracy: acc
(acc <- sum(TP, TN) / sum(conf))
## [1] 0.8165266
# Calculate and print out the precision: prec
(prec <- TP / (TP + FP))
## [1] 0.8
# Calculate and print out the recall: rec
(rec <- TP / (TP + FN))
## [1] 0.7310345
# DO NOT HAVE THIS DATASET
# Take a look at the structure of air
# str(air)
# Inspect your colleague's code to build the model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# Use the model to predict for all values: pred
# pred <- predict(fit, air)
# Use air$dec and pred to calculate the RMSE
# rmse <- sqrt( mean((air$dec-pred)^2) )
# Print out rmse
# rmse
# Previous model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# pred <- predict(fit)
# rmse <- sqrt(sum( (air$dec - pred) ^ 2) / nrow(air))
# rmse
# Your colleague's more complex model
# fit2 <- lm(dec ~ freq + angle + ch_length + velocity + thickness, data = air)
# Use the model to predict for all values: pred2
# pred2 <- predict(fit2)
# Calculate rmse2
# rmse2 <- sqrt(sum( (air$dec - pred2) ^ 2) / nrow(air))
# Print out rmse2
# rmse2
# ALSO DO NOT HAVE THIS DATASET, THOUGH IT IS AVAILABLE ON UCI
# Explore the structure of the dataset
# (the UCI seeds file is tab-separated with no header row)
seeds <- read.delim("seeds.txt", header=FALSE,
col.names=c("area", "perimeter", "compactness", "length",
"width", "asymmetry", "groove", "type"
)
)
str(seeds)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness: num 0.871 0.881 0.905 0.895 0.903 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.35 ...
## $ groove : num 5.22 4.96 4.83 4.8 5.17 ...
## $ type : int 1 1 1 1 1 1 1 1 1 1 ...
# Group the seeds in three clusters
# (seeds[,-8] drops the known `type` label so clustering is unsupervised)
km_seeds <- kmeans(seeds[,-8], 3)
# Color the points in the plot based on the clusters
plot(length ~ compactness, data = seeds, col=km_seeds$cluster)
# Print out the ratio of the WSS to the BSS
# (lower within-cluster / between-cluster variance = tighter clustering)
with(km_seeds, tot.withinss / betweenss)
## [1] 0.2762846
# Shuffle the dataset, call the result shuffled
# (sample(n) is a random permutation of 1..n; rows are reordered, not dropped)
n <- nrow(titanic)
shuffled <- titanic[sample(n),]
# Split the data in train and test
# (70/30 split: first 70% of the shuffled rows train, the rest test)
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 1 1 2 2 1 2 2 2 1 ...
## $ Pclass : int 1 3 1 3 3 1 2 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 22 32 42 27 24 53 19 28 26 39 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 2 1 1 1 2 1 2 2 ...
## $ Pclass : int 2 3 3 2 2 2 3 2 2 1 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 1 2 1 2 2 ...
## $ Age : num 24 21 10 19 25 45 41 23 54 44 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the model that has been learned.
tree <- rpart::rpart(Survived ~ ., data=train, method = "class")
# Predict the outcome on the test set with tree: pred
pred <- predict(tree, newdata=test, type="class")
# Calculate the confusion matrix: conf
# (wrapping the assignment in parentheses also prints the result)
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 68 35
## 0 12 99
# --- 6-fold cross-validation by hand ---
# NOTE(review): the fold arithmetic assumes nrow(shuffled) is a multiple of 6
# (714 here, per the earlier str() output); otherwise trailing rows would
# never appear in any test fold.
# Initialize the accs vector
accs <- rep(0,6)
for (i in 1:6) {
# These indices indicate the interval of the test set
indices <- (((i-1) * round((1/6)*nrow(shuffled))) + 1):((i*round((1/6) * nrow(shuffled))))
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart::rpart(Survived ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, newdata=test, type="class")
# Assign the confusion matrix to conf
conf <- table(test$Survived, pred)
# Assign the accuracy of this model to the ith index in accs
# (diagonal of the confusion matrix = correctly classified observations)
accs[i] <- sum(diag(conf))/sum(conf)
}
# Print out the mean of accs
mean(accs)
## [1] 0.7871148
data(spam, package="kernlab")
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Build the full email dataset from kernlab's spam data: keep one feature
# (capitalAve, renamed avg_capital_seq) and recode the type factor to a
# 1/0 spam factor with the positive class first.
# NOTE(review): assumes dplyr is attached for %>% / select / mutate.
emails_full <- spam %>%
select(capitalAve, type) %>%
mutate(avg_capital_seq=capitalAve, spam=factor(as.integer(type)-1, levels=c(1, 0))) %>%
select(avg_capital_seq, spam)
str(emails_full)
## 'data.frame': 4601 obs. of 2 variables:
## $ avg_capital_seq: num 3.76 5.11 9.82 3.54 3.54 ...
## $ spam : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
# The spam filter that has been 'learned' for you
spam_classifier <- function(x){
  # Same hand-tuned banded rule as before, but the result is returned as a
  # factor with levels c("1", "0") so confusion-matrix code puts the
  # positive class ("1" = spam) in the first row/column.
  labels <- ifelse(x > 4, 1,
              ifelse(x >= 3, 0,
                ifelse(x >= 2.2, 1,
                  ifelse(x >= 1.4, 0,
                    ifelse(x > 1.25, 1, 0)))))
  factor(labels, levels = c("1", "0"))
}
# Apply spam_classifier to emails_full: pred_full
pred_full <- spam_classifier(emails_full$avg_capital_seq)
# Build confusion matrix for emails_full: conf_full
conf_full <- table(emails_full$spam, pred_full)
# Calculate the accuracy with conf_full: acc_full
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.6561617
emails_small <- data.frame(avg_capital_seq=c( 1, 2.112, 4.123, 1.863, 2.973, 1.687, 5.891,
3.167, 1.23, 2.441, 3.555, 3.25, 1.333
),
spam=factor(c(0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1), levels=c(1, 0))
)
str(emails_small)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : Factor w/ 2 levels "1","0": 2 2 1 2 1 2 1 2 2 1 ...
spam_classifier <- function(x) {
  # Simplified single-threshold rule: values above 4 are labelled spam ("1"),
  # everything else ham ("0"). Returned as a factor with the positive
  # class first, matching the banded classifier's return convention.
  factor(as.integer(x > 4), levels = c("1", "0"))
}
# conf_small and acc_small have been calculated for you
conf_small <- table(emails_small$spam, spam_classifier(emails_small$avg_capital_seq))
acc_small <- sum(diag(conf_small)) / sum(conf_small)
acc_small
## [1] 0.7692308
# Apply spam_classifier to emails_full and calculate the confusion matrix: conf_full
conf_full <- table(emails_full$spam, spam_classifier(emails_full$avg_capital_seq))
# Calculate acc_full
# (the simple single-threshold rule scores 0.726 on the full data, beating
# the overfit banded rule's 0.656 from earlier -- the bias/variance lesson)
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.7259291
Chapter 3 - Classification
Decision trees - assign class to an unseen observation (each observation consists of a vector of features, and a classification):
K-nearest-neighbors (knn) - an example of “instance based learning”:
ROC curve - Receiver Operating Characteristic curve - is a powerful performance measure for binary classification:
Example code includes:
# Rebuild the Titanic data for Chapter 3: here Pclass is also made a factor
# so the tree treats class as categorical rather than ordered numeric.
# NOTE(review): assumes dplyr is attached for %>% / select / mutate.
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex), Pclass=factor(Pclass)) %>%
na.omit()
# Random 70/30 train/test split by row index
trIdx <- sample(x=1:nrow(titanic), size=round(.7*nrow(titanic)), replace=FALSE)
train <- titanic[trIdx, ]
test <- titanic[-trIdx, ]
str(train); str(test)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 2 2 1 2 2 2 1 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 2 2 2 3 3 3 1 2 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 2 2 1 2 2 2 1 2 ...
## $ Age : num 40 59 31 4 27 25 42 34 27 42 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 1 1 1 2 2 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 1 1 2 3 1 3 2 2 1 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 1 1 1 2 2 2 2 2 ...
## $ Age : num 38 54 14 4 58 39 35 34 40 66 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the ___, build a tree model: tree
tree <- rpart::rpart(Survived ~ ., data=train, method="class")
# Draw the decision tree
rattle::fancyRpartPlot(tree)
# Predict the values of the test set: pred
pred <- predict(tree, newdata=test, type="class")
# Construct the confusion matrix: conf
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 59 37
## 0 2 116
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.817757
# Calculation of a complex tree
# (a near-zero complexity parameter cp lets rpart keep almost every split,
# deliberately growing a deep, overfit tree)
tree <- rpart::rpart(Survived ~ ., train, method = "class", control = rpart::rpart.control(cp=0.00001))
# Draw the complex tree
rattle::fancyRpartPlot(tree)
# Prune the tree: pruned
# (pruning back at cp=0.01, rpart's default, collapses low-value splits)
pruned <- rpart::prune(tree, cp=0.01)
# Draw pruned
rattle::fancyRpartPlot(pruned)
# Load kernlab's spam data and recode the type factor into a 0/1 integer
# column named spam (dropping the original factor column).
# NOTE(review): assumes dplyr is attached for %>% / mutate / select.
data(spam, package="kernlab")
spam <- spam %>%
mutate(spam=as.integer(type)-1L) %>%
select(-type)
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ spam : int 1 1 1 1 1 1 1 1 1 1 ...
# Random 70/30 split of the spam data
idxTrain <- sample(x=1:nrow(spam), size=round(.7*nrow(spam)), replace=FALSE)
train <- spam[idxTrain, ]
test <- spam[-idxTrain, ]
dim(train); dim(test)
## [1] 3221 58
## [1] 1380 58
# Train and test tree with gini criterion
# (gini is rpart's default splitting criterion for method = "class")
tree_g <- rpart::rpart(spam ~ ., train, method = "class")
pred_g <- predict(tree_g, test, type = "class")
conf_g <- table(test$spam, pred_g)
acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Change the first line of code to use information gain as splitting criterion
tree_i <- rpart::rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
pred_i <- predict(tree_i, test, type = "class")
conf_i <- table(test$spam, pred_i)
acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
rattle::fancyRpartPlot(tree_g)
rattle::fancyRpartPlot(tree_i)
# Print out acc_g and acc_i
# (on this split the two criteria give identical test accuracy)
acc_g
## [1] 0.8992754
acc_i
## [1] 0.8992754
# Shuffle the dataset, call the result shuffled
# (Sex is recoded to numeric 0/1 here because knn needs numeric features
# to compute Euclidean distances)
# NOTE(review): assumes dplyr is attached for %>% / select / mutate.
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=as.integer(factor(Sex))-1L) %>%
na.omit()
n <- nrow(titanic)
shuffled <- titanic[sample(n),]
# Split the data in train and test
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 2 1 2 2 2 2 2 2 2 ...
## $ Pclass : int 1 3 2 2 3 3 2 3 3 3 ...
## $ Sex : int 1 1 0 1 1 1 0 0 1 1 ...
## $ Age : num 65 40.5 30 50 34 20 44 20 10 21 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 1 1 2 2 2 2 2 1 2 ...
## $ Pclass : int 1 1 3 1 3 3 3 3 1 1 ...
## $ Sex : int 0 0 1 0 1 1 1 1 0 1 ...
## $ Age : num 24 21 27 25 2 40 24 35 33 40 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Store the Survived column of train and test in train_labels and test_labels
train_labels <- train$Survived
test_labels <- test$Survived
# Copy train and test to knn_train and knn_test
knn_train <- train
knn_test <- test
# Drop Survived column for knn_train and knn_test
# (knn operates on features only; labels are passed separately via cl=)
knn_train$Survived <- NULL
knn_test$Survived <- NULL
# Normalize Pclass
# (min-max scaling to [0, 1]; note the TEST set is scaled with the TRAINING
# min/max, which is the correct way to avoid information leakage)
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# Normalize Age
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
# Fill in the ___, make predictions using knn: pred
pred <- class::knn(train = knn_train, test = knn_test, cl = train_labels, k = 5)
# Construct the confusion matrix: conf
(conf <- table(test_labels, pred))
## pred
## test_labels 1 0
## 1 67 22
## 0 19 106
# Tune k by measuring knn accuracy for k = 1 .. 20% of the training-set size.
# `k_values` replaces the original variable name `range`, which shadowed
# base::range(); seq_len() is also safe if the rounded bound were ever 0
# (1:0 would iterate backwards over c(1, 0)).
k_values <- seq_len(round(0.2 * nrow(knn_train)))
accs <- rep(0, length(k_values))
for (k in k_values) {
  # Make predictions using knn: pred
  pred <- class::knn(knn_train, knn_test, cl = train_labels, k = k)
  # Construct the confusion matrix: conf
  conf <- table(test_labels, pred)
  # Calculate the accuracy and store it in accs[k]
  accs[k] <- sum(diag(conf)) / sum(conf)
}
# Plot the accuracies. Title of x-axis is "k".
plot(k_values, accs, xlab = "k")
# Calculate the best k
which.max(accs)
## [1] 4
# CAUTION - DO NOT HAVE THIS DATA, though UCIMLR (Census + Income) is the SOURCE
# test should be 9215 x 14 while train should be 21503 x 14
# income is the key variable, with 1 meaning > $50,000 while 0 meaning otherwise
# Build a tree on the training set: tree
# tree <- rpart::rpart(income ~ ., train, method = "class")
# Predict probability values using the model: all_probs
# all_probs <- predict(tree, newdata=test, type="prob")
# Print out all_probs
# str(all_probs)
# Select second column of all_probs: probs
# probs <- all_probs[, 2]
# Make a prediction object: pred
# pred <- ROCR::prediction(probs, test$income)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "tpr", "fpr")
# Plot this curve
# plot(perf)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "auc")
# Print out the AUC
# perf@y.values[[1]]
# EVEN MORE DATA THAT I DO NOT HAVE
# Plot two ROC curves (decision tree vs. k-nearest-neighbours) on one canvas.
#
# Args:
#   tree: ROCR performance object built with measures "tpr" and "fpr".
#   knn:  ROCR performance object built with measures "tpr" and "fpr".
# Side effect: draws on the active graphics device.
draw_roc_lines <- function(tree, knn) {
  # A valid argument is an S4 "performance" object from the ROCR package.
  # methods::is() replaces class(x) == "performance": comparing class(x)
  # breaks under && when class(x) has length > 1 (an error in R >= 4.3),
  # and identical() handles a NULL "package" attribute safely.
  is_rocr_perf <- function(x) {
    methods::is(x, "performance") &&
      identical(attr(class(x), "package"), "ROCR")
  }
  if (!is_rocr_perf(tree) || !is_rocr_perf(knn)) {
    stop("This predefined function needs two performance objects as arguments.")
  } else if (length(tree@x.values) == 0 || length(knn@x.values) == 0) {
    # scalar condition: short-circuiting || rather than elementwise |
    stop('This predefined function needs the right kind of performance objects as arguments. Are you sure you are creating both objects with arguments "tpr" and "fpr"?')
  } else {
    # Empty canvas spanning the unit square, then one curve per model.
    plot(0, 0,
         type = "n",
         main = "ROC Curves",
         ylab = "True positive rate",
         xlab = "False positive rate",
         ylim = c(0, 1),
         xlim = c(0, 1))
    lines(tree@x.values[[1]], tree@y.values[[1]], type = "l", lwd = 2, col = "red")
    lines(knn@x.values[[1]], knn@y.values[[1]], type = "l", lwd = 2, col = "green")
    legend("bottomright", c("DT", "KNN"), lty = c(1, 1), lwd = c(2.5, 2.5), col = c("red", "green"))
  }
}
# Make the prediction objects for both models: pred_t, pred_k
# pred_t <- ROCR::prediction(probs_t, test$spam)
# pred_k <- ROCR::prediction(probs_k, test$spam)
# Make the performance objects for both models: perf_t, perf_k
# perf_t <- ROCR::performance(pred_t, "tpr", "fpr")
# perf_k <- ROCR::performance(pred_k, "tpr", "fpr")
# Draw the ROC lines using draw_roc_lines()
# draw_roc_lines(perf_t, perf_k)
Chapter 4 - Regression
Simple Linear Regression - estimates an actual value rather than the class of an observation:
Multivariable Linear Regression - combining several predictors all in a single model:
k-Nearest-Neighbors and Generalization - solution to problem of not knowing what transformations to use:
Example code includes:
# Kangaroo nose data: 45 paired measurements of nose width and nose length.
kang_nose <- data.frame(nose_width=c( 241, 222, 233, 207, 247, 189, 226, 240, 215, 231, 263, 220, 271, 284, 279, 272, 268, 278, 238, 255, 308, 281, 288, 306, 236, 204, 216, 225, 220, 219, 201, 213, 228, 234, 237, 217, 211, 238, 221, 281, 292, 251, 231, 275, 275 ) ,
nose_length=c( 609, 629, 620, 564, 645, 493, 606, 660, 630, 672, 778, 616, 727, 810, 778, 823, 755, 710, 701, 803, 855, 838, 830, 864, 635, 565, 562, 580, 596, 597, 636, 559, 615, 740, 677, 675, 629, 692, 710, 730, 763, 686, 717, 737, 816 )
)
str(kang_nose)
## 'data.frame': 45 obs. of 2 variables:
## $ nose_width : num 241 222 233 207 247 189 226 240 215 231 ...
## $ nose_length: num 609 629 620 564 645 493 606 660 630 672 ...
# New observation (nose width only) to predict a nose length for
nose_width_new <- data.frame(nose_width=250)
# Plot nose length as function of nose width.
plot(kang_nose, xlab = "nose width", ylab = "nose length")
# Fill in the ___, describe the linear relationship between the two variables: lm_kang
lm_kang <- lm(nose_length ~ nose_width, data = kang_nose)
# Print the coefficients of lm_kang
lm_kang$coefficients
## (Intercept) nose_width
## 27.893058 2.701175
# Predict and print the nose length of the escaped kangaroo
predict(lm_kang, newdata=nose_width_new)
## 1
## 703.1869
# Build model and make plot
lm_kang <- lm(nose_length ~ nose_width, data=kang_nose)
plot(kang_nose, xlab = "nose width", ylab = "nose length")
abline(lm_kang$coefficients, col = "red")
# Apply predict() to lm_kang: nose_length_est
# (no newdata argument, so these are the fitted values on the training data)
nose_length_est <- predict(lm_kang)
# Calculate difference between the predicted and the true values: res
res <- (kang_nose$nose_length - nose_length_est)
# Calculate RMSE, assign it to rmse and print it
(rmse <- sqrt( mean( res^2 ) ))
## [1] 43.26288
# Calculate the residual sum of squares: ss_res
ss_res <- sum(res^2)
# Determine the total sum of squares: ss_tot
ss_tot <- sum( (kang_nose$nose_length - mean(kang_nose$nose_length))^2 )
# Calculate R-squared and assign it to r_sq. Also print it.
# R^2 = 1 - SS_res / SS_tot
(r_sq <- 1 - ss_res / ss_tot)
## [1] 0.7768914
# Apply summary() to lm_kang
summary(lm_kang)
##
## Call:
## lm(formula = nose_length ~ nose_width, data = kang_nose)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.876 -32.912 -4.855 30.227 86.307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.8931 54.2991 0.514 0.61
## nose_width 2.7012 0.2207 12.236 1.34e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.26 on 43 degrees of freedom
## Multiple R-squared: 0.7769, Adjusted R-squared: 0.7717
## F-statistic: 149.7 on 1 and 43 DF, p-value: 1.342e-15
# World Bank training data: GDP per capita (cgdp) and percentage of urban
# population (urb_pop) for 142 countries.
cgdp <- c( 666.3, 5935.7, 4619.2, 7574.3, 3646.7, 13961.2, 51127.1, 7884.2, 295.1, 47516.5, 825.2, 720, 1096.6, 7712.8, 22245.5, 4796.2, 8040, 11612.5, 15199.3, 40776.3, 7757, 378.6, 7593.9, 1426.4, 7720, 860.8, 3715.3, 10035.4, 27194.4, 47627.4, 7433.9, 60634.4, 6222.5, 6850.3, 39567.9, 590.2, 30262.2, 567.8, 36317.8, 1555, 49541.3, 4543.3, 1461.6, 550, 422.8, 585.6, 21682.6, 8299.1, 3703, 37896.8, 2346.7, 13507.4, 13902.7, 3514.6, 53313.6, 6432.8, 52111, 34960.3, 36194.4, 1269.1, 1084.4, 1604.4, 15209.9, 27970.5, 9127.3, 1707.5, 10139.2, 6575.4, 7437, 10125.6, 944.4, 648.1, 3631, 2032.8, 4301.1, 995.5, 16037.8, 3140, 2233.8, 449.4, 8624.8, 8518.7, 10361.3, 4731.6, 5370.7, 765.7, 1197.5, 4333.3, 7370.9, 4170.2, 1270.2, 10005.6, 253, 54198.7, 440.7, 3184.6, 1913.6, 97363.1, 698.3, 38400.1, 4749, 1333.5, 11770.9, 6594.4, 2843.1, 11879.7, 14422.8, 22080.9, 4479.1, 3575.2, 93397.1, 9996.7, 12735.9, 652.1, 1541.1, 25409, 1904.2, 1070.9, 2021.7, 3950.7, 6152.9, 1781.1, 1113.4, 1692.4, 18416.5, 23962.6, 58887.3, 2682.3, 15359.2, 1053.8, 646.1, 9031.5, 1280.4, 4106.4, 998.1, 677.4, 3082.5, 7986.9, 16810.9, 6477.9, 475.2, 1801.9 )
urb_pop <- c( 26.3, 43.3, 56.4, 57.6, 62.8, 24.2, 65.9, 54.4, 11.8, 97.8, 43.5, 29, 33.5, 73.6, 82.8, 39.6, 76.3, 85.4, 31.6, 76.9, 57.2, 39.8, 54.4, 53.8, 76.2, 28.2, 64.8, 75.9, 67, 75.1, 69.3, 87.5, 51.9, 59.9, 75.7, 22.2, 79.4, 19, 74.6, 40.7, 84.1, 53.4, 53.4, 36.7, 59, 48.5, 77.7, 35.6, 51.1, 80.6, 54.1, 58.7, 70.8, 53, 63, 69.4, 94, 68.8, 93, 35.6, 20.5, 44.2, 32, 82.4, 77.7, 37.6, 87.7, 78.4, 18.5, 79.5, 30.9, 29.6, 18.3, 38.6, 47, 26.8, 67.4, 59.7, 44.9, 34.5, 44.5, 64.1, 79, 49.1, 57, 39.1, 33.6, 60.4, 63.8, 71.2, 59.3, 39.8, 16.1, 81.5, 18.5, 46.9, 58.5, 80.2, 18.2, 80, 48.6, 38.3, 66.3, 78.3, 44.5, 86.5, 60.6, 62.9, 59.4, 37.2, 99.2, 54.4, 73.9, 27.8, 32.6, 82.9, 33.6, 43.4, 21.9, 66.3, 55.5, 37.2, 18.6, 64.5, 53.8, 49.7, 85.7, 21.3, 53.6, 22.3, 39.5, 49.7, 32.1, 23.6, 30.9, 15.8, 69.5, 61.8, 95.2, 64.3, 42, 40.5 )
world_bank_train <- data.frame(urb_pop=urb_pop, cgdp=cgdp)
str(world_bank_train)
## 'data.frame': 142 obs. of 2 variables:
## $ urb_pop: num 26.3 43.3 56.4 57.6 62.8 24.2 65.9 54.4 11.8 97.8 ...
## $ cgdp : num 666 5936 4619 7574 3647 ...
# Afghanistan's GDP per capita, for prediction
cgdp_afg <- data.frame(cgdp=413)
# Plot urb_pop as function of cgdp
with(world_bank_train, plot(y=urb_pop, x=cgdp))
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col="red")
# Summarize lm_wb and select R-squared
summary(lm_wb)$r.squared
## [1] 0.3822347
# Predict the urban population of afghanistan based on cgdp_afg
predict(lm_wb, newdata=cgdp_afg)
## 1
## 45.0156
# Plot: change the formula and xlab
# (log-transforming cgdp straightens out the strongly skewed GDP scale)
plot(urb_pop ~ log(cgdp), data = world_bank_train,
xlab = "log(GDP per Capita)",
ylab = "Percentage of urban population")
# Linear model: change the formula
lm_wb <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col = "red")
# Summarize lm_wb and select R-squared (higher than the linear fit above)
summary(lm_wb)$r.squared
## [1] 0.5788284
# Predict the urban population of afghanistan based on cgdp_afg
# (predict() applies the log() from the formula to the new data automatically)
predict(lm_wb, newdata=cgdp_afg)
## 1
## 25.86829
# Shop data: net sales plus five candidate predictors for 27 stores.
sales <- c( 231, 156, 10, 519, 437, 487, 299, 195, 20, 68, 570, 428, 464, 15, 65, 98, 398, 161, 397, 497, 528, 99, 0.5, 347, 341, 507, 400 )
sq_ft <- c( 3, 2.2, 0.5, 5.5, 4.4, 4.8, 3.1, 2.5, 1.2, 0.6, 5.4, 4.2, 4.7, 0.6, 1.2, 1.6, 4.3, 2.6, 3.8, 5.3, 5.6, 0.8, 1.1, 3.6, 3.5, 5.1, 8.6 )
inv <- c( 294, 232, 149, 600, 567, 571, 512, 347, 212, 102, 788, 577, 535, 163, 168, 151, 342, 196, 453, 518, 615, 278, 142, 461, 382, 590, 517 )
ads <- c( 8.2, 6.9, 3, 12, 10.6, 11.8, 8.1, 7.7, 3.3, 4.9, 17.4, 10.5, 11.3, 2.5, 4.7, 4.6, 5.5, 7.2, 10.4, 11.5, 12.3, 2.8, 3.1, 9.6, 9.8, 12, 7 )
size_dist <- c( 8.2, 4.1, 4.3, 16.1, 14.1, 12.7, 10.1, 8.4, 2.1, 4.7, 12.3, 14, 15, 2.5, 3.3, 2.7, 16, 6.3, 13.9, 16.3, 16, 6.5, 1.6, 11.3, 11.5, 15.7, 12 )
comp <- c( 11, 12, 15, 1, 5, 4, 10, 12, 15, 8, 1, 7, 3, 14, 11, 10, 4, 13, 7, 1, 0, 14, 12, 6, 5, 0, 8 )
shop_data <- data.frame(sales=sales, sq_ft=sq_ft, inv=inv, ads=ads,
size_dist=size_dist, comp=comp
)
str(shop_data)
## 'data.frame': 27 obs. of 6 variables:
## $ sales : num 231 156 10 519 437 487 299 195 20 68 ...
## $ sq_ft : num 3 2.2 0.5 5.5 4.4 4.8 3.1 2.5 1.2 0.6 ...
## $ inv : num 294 232 149 600 567 571 512 347 212 102 ...
## $ ads : num 8.2 6.9 3 12 10.6 11.8 8.1 7.7 3.3 4.9 ...
## $ size_dist: num 8.2 4.1 4.3 16.1 14.1 12.7 10.1 8.4 2.1 4.7 ...
## $ comp : num 11 12 15 1 5 4 10 12 15 8 ...
# A new store (all predictors, no sales yet) to predict for
shop_new <- data.frame(sq_ft=2.3, inv=420, ads=8.7, size_dist=9.1, comp=10)
# Add a plot: sales as a function of inventory. Is linearity plausible?
plot(sales ~ sq_ft, shop_data)
plot(sales ~ size_dist, shop_data)
plot(sales ~ inv, shop_data)
# Build a linear model for net sales based on all other variables: lm_shop
# ("." in the formula means: every column except the response)
lm_shop <- lm(sales ~ ., data=shop_data)
# Summarize lm_shop
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Residual diagnostics for lm_shop:
# Plot the residuals in function of your fitted observations
# (a patternless cloud supports the linearity/constant-variance assumptions)
plot(x=lm_shop$fitted.values, y=lm_shop$residuals)
# Make a Q-Q plot of your residual quantiles
# (roughly straight line supports the normal-errors assumption)
qqnorm(lm_shop$residuals, ylab="Residual Quantiles")
# Summarize your model, are there any irrelevant predictors?
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Predict the net sales based on shop_new.
# (shop_new supplies all five predictors used by lm_shop)
predict(lm_shop, newdata=shop_new)
## 1
## 262.5006
# Chocolate-bar data: energy per 100g plus protein, fat and bar size for 56 bars.
choco_data <- data.frame(
energy=c( 1970, 2003, 2057, 1920, 2250, 2186, 1930, 1980, 1890, 2030, 2180, 1623, 1640, 2210, 1980, 1970, 1877.4, 2021.4, 1840.1, 2272.1, 2047.3, 1843, 2075.2, 2119.8, 2090.9, 1934.3, 2257.3, 2057.9, 1878.2, 1595.3, 2188.3, 1980.4, 1985.9, 2156.5, 2134.6, 2094.2, 2151.7, 2127.7, 2001.9, 1635.2, 2098.9, 1978.6, 1961.2, 1727.2, 1903.7, 2062.6, 2230.1, 1970.5, 2057.4, 1979.2, 1744.1, 1914.9, 1918.7, 1978.1, 2184, 2124.4 ),
protein=c( 3.1, 4.6, 9.9, 5.1, 10.2, 7, 3.5, 7.2, 4.7, 5.6, 5.5, 2.2, 3.7, 8.2, 8.5, 5, 6.1, 4.6, 3.4, 10.5, 5.9, 3.2, 5.6, 7.5, 7.3, 5.4, 8.9, 6, 2.8, 3.4, 5.5, 7, 7.7, 8.9, 9.4, 7.5, 10.4, 5.6, 9.1, 2.9, 9.1, 4.7, 2.2, 2.3, 6.3, 6.7, 8.3, 6.3, 5.3, 7.8, 5.8, 7, 4.3, 6.9, 8.9, 5 ),
fat=c( 27.2, 26.5, 23, 18.4, 30.1, 28.4, 24.5, 22.9, 19.5, 20.4, 26.8, 9.2, 12, 29.8, 20.6, 20, 18, 22.3, 20.8, 27.7, 25.7, 18.3, 27.6, 25.8, 26.9, 21.6, 29.4, 27.8, 21.4, 12.9, 32.1, 24.4, 19.6, 26.6, 24.5, 24.6, 27.2, 26.1, 21.8, 12.2, 25, 26.7, 22, 16.5, 21.5, 29.6, 28.1, 20.8, 28.1, 21.2, 15.4, 19.9, 18.9, 21.9, 30.5, 25.1 ),
size=c( 50, 50, 40, 80, 45, 78, 55, 60, 60, 50, 40, 55, 44.5, 75, 60, 42.5, 52.3, 52.3, 63.1, 64.8, 46.9, 45, 60.7, 66.3, 54.7, 66.2, 62.6, 48, 58.8, 37.5, 75.4, 80.8, 50.6, 43.3, 63.9, 54.4, 87.6, 55.9, 64.3, 52.8, 46.7, 57.7, 31.8, 72, 56.6, 83.9, 63.4, 46, 63.7, 43.2, 37.2, 58.5, 49, 55.2, 57.9, 48.8 )
)
str(choco_data)
## 'data.frame': 56 obs. of 4 variables:
## $ energy : num 1970 2003 2057 1920 2250 ...
## $ protein: num 3.1 4.6 9.9 5.1 10.2 7 3.5 7.2 4.7 5.6 ...
## $ fat : num 27.2 26.5 23 18.4 30.1 28.4 24.5 22.9 19.5 20.4 ...
## $ size : num 50 50 40 80 45 78 55 60 60 50 ...
# Add a plot: energy/100g as function of total size. Linearity plausible?
plot(energy ~ protein, choco_data)
plot(energy ~ fat, choco_data)
plot(energy ~ size, choco_data)
# Build a linear model for the energy based on all other variables: lm_choco
lm_choco <- lm(energy ~ ., data=choco_data)
# Plot the residuals in function of your fitted observations
plot(x=lm_choco$fitted.values, y=lm_choco$residuals)
# Make a Q-Q plot of your residual quantiles
qqnorm(lm_choco$residuals)
# Summarize lm_choco (the summary below shows size is not significant)
summary(lm_choco)
##
## Call:
## lm(formula = energy ~ ., data = choco_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.680 -36.071 -9.062 36.079 104.361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1339.2806 40.0195 33.466 < 2e-16 ***
## protein 23.0122 3.6565 6.293 6.6e-08 ***
## fat 24.4416 1.6839 14.515 < 2e-16 ***
## size -0.8224 0.6026 -1.365 0.178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.14 on 52 degrees of freedom
## Multiple R-squared: 0.9021, Adjusted R-squared: 0.8965
## F-statistic: 159.8 on 3 and 52 DF, p-value: < 2.2e-16
# World Bank test data: 61 held-out countries for out-of-sample evaluation.
world_bank_test <- data.frame(
cgdp=c( 18389.4, 1099, 2379.2, 5823.3, 3670, 788.4, 1646.4, 19553.9, 1630.8, 61887, 2965.9, 3436.3, 12276.4, 3150.5, 42736.2, 16529.7, 10067.5, 25592.4, 50271.1, 5422.6, 6290.8, 20832, 10803.5, 935.9, 37031.7, 5292.9, 45603.3, 42522, 56286.8, 14520, 5361.1, 6662.6, 4017, 2037.7, 6075.5, 1784.4, 96443.7, 40169.6, 19719.8, 1796, 619, 10829.9, 16444.8, 14091.4, 54629.5, 5560.7, 43619.1, 19199.3, 832.9, 9463.1, 25198.1, 461, 5719.6, 3100.8, 10542.8, 12922.4, 1337.9, 51590, 914.7, 2052.3, 4173.4 ),
urb_pop=c( 39.8, 26.7, 37.9, 46.2, 53.5, 39.6, 53.5, 73, 32.4, 89.3, 75, 43.1, 53.3, 68.1, 79.3, 88.9, 86.9, 70.7, 81.7, 83.4, 63.5, 77.2, 53.5, 32.5, 92.1, 72.9, 82.3, 85.3, 100, 89.4, 70.1, 50.2, 28.5, 36.3, 78.1, 77.3, 100, 100, 67.6, 37.2, 31.9, 74, 66.5, 62.3, 81.4, 49.2, 80.7, 80.5, 57.4, 55.7, 88.7, 49.3, 45.7, 65, 72.9, 91.6, 25.2, 89.9, 34, 33, 19.3 )
)
str(world_bank_test)
## 'data.frame': 61 obs. of 2 variables:
## $ cgdp : num 18389 1099 2379 5823 3670 ...
## $ urb_pop: num 39.8 26.7 37.9 46.2 53.5 39.6 53.5 73 32.4 89.3 ...
# Build the log-linear model
lm_wb_log <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Calculate rmse_train
rmse_train <- sqrt(mean(lm_wb_log$residuals ^ 2))
# The real percentage of urban population in the test set, the ground truth
world_bank_test_truth <- world_bank_test$urb_pop
# The predictions of the percentage of urban population in the test set
world_bank_test_input <- data.frame(cgdp = world_bank_test$cgdp)
world_bank_test_output <- predict(lm_wb_log, world_bank_test_input)
# The residuals: the difference between the ground truth and the predictions
# (sign is prediction minus truth here, reversed vs. res above; RMSE squares it away)
res_test <- world_bank_test_output - world_bank_test_truth
# Use res_test to calculate rmse_test
rmse_test <- sqrt(mean(res_test^2))
# Print the ratio of the test RMSE over the training RMSE
# (close to 1 means the model generalizes about as well as it fits)
rmse_test / rmse_train
## [1] 1.082428
# Simple one-dimensional k-nearest-neighbors regression.
#
# Args:
#   x_pred: numeric vector of predictor values to produce predictions for.
#   x:      numeric vector of training predictor values.
#   y:      numeric vector of training responses, parallel to x.
#   k:      number of nearest neighbors to average over.
#
# Returns: numeric vector the same length as x_pred; each element is the mean
# response of the k training points closest (in absolute distance) to the
# corresponding x_pred value. Ties in distance are broken by training order
# (the behavior of order()).
my_knn <- function(x_pred, x, y, k){
  m <- length(x_pred)
  predict_knn <- rep(0, m)
  # seq_len() (rather than 1:m) iterates zero times when x_pred is empty;
  # 1:m would loop over c(1, 0) and produce a spurious NA prediction.
  for (i in seq_len(m)) {
    # Calculate the absolute distance between x_pred[i] and x
    dist <- abs(x_pred[i] - x)
    # order(dist) gives the indices of the dist vector in ascending
    # order, so sort_index[1:k] indexes the k nearest neighbors.
    sort_index <- order(dist)
    # Apply mean() to the responses of the k-nearest neighbors
    predict_knn[i] <- mean(y[sort_index[1:k]])
  }
  return(predict_knn)
}
# Apply your algorithm on the test set: test_output
test_output <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
# Have a look at the plot of the output
# (columns 2:1 swaps the order so cgdp is on the x-axis)
plot(world_bank_train[,2:1],
xlab = "GDP per Capita",
ylab = "Percentage Urban Population")
points(world_bank_test$cgdp, test_output, col = "green")
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Set up a log-linear model between the two variables: lm_wb_log
lm_wb_log <- lm(urb_pop ~ log(cgdp), data=world_bank_train)
# Define ranks to order the predictor variables in the test set
# (lines() needs x sorted to draw a sensible curve)
ranks <- order(world_bank_test$cgdp)
# Scatter plot of test set
plot(world_bank_test,
xlab = "GDP per Capita", ylab = "Percentage Urban Population")
# Predict with simple linear model and add line
test_output_lm <- predict(lm_wb, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm[ranks], lwd = 2, col = "blue")
# Predict with log-linear model and add line
test_output_lm_log <- predict(lm_wb_log, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm_log[ranks], lwd = 2, col = "red")
# Predict with k-NN and add line
test_output_knn <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
lines(world_bank_test$cgdp[ranks], test_output_knn[ranks], lwd = 2, col = "green")
# Calculate RMSE on the test set for simple linear model
sqrt(mean( (test_output_lm - world_bank_test$urb_pop) ^ 2))
## [1] 17.41258
# Calculate RMSE on the test set for log-linear model (the best of the three)
sqrt(mean( (test_output_lm_log - world_bank_test$urb_pop) ^ 2))
## [1] 15.01008
# Calculate RMSE on the test set for k-NN technique
sqrt(mean( (test_output_knn - world_bank_test$urb_pop) ^ 2))
## [1] 16.0917
Chapter 5 - Clustering
Clustering with k-means (unsupervised learning) - objects that are similar within a cluster and dissimilar across clusters:
Performance and scaling issues - since there is no “truth”, the goal is to have compact clusters with low variance within the clusters and high separation between the clusters:
Hierarchical Clustering - addressing questions such as “which objects cluster first” and “which cluster pairs merge, and when”:
Example code includes:
# Seeds data set (UCI): 7 geometric measurements of 210 wheat kernels.
seeds <- data.frame(area=c( 15.26, 14.88, 14.29, 13.84, 16.14, 14.38, 14.69, 14.11, 16.63, 16.44, 15.26, 14.03, 13.89, 13.78, 13.74, 14.59, 13.99, 15.69, 14.7, 12.72, 14.16, 14.11, 15.88, 12.08, 15.01, 16.19, 13.02, 12.74, 14.11, 13.45, 13.16, 15.49, 14.09, 13.94, 15.05, 16.12, 16.2, 17.08, 14.8, 14.28, 13.54, 13.5, 13.16, 15.5, 15.11, 13.8, 15.36, 14.99, 14.79, 14.86, 14.43, 15.78, 14.49, 14.33, 14.52, 15.03, 14.46, 14.92, 15.38, 12.11, 11.42, 11.23, 12.36, 13.22, 12.78, 12.88, 14.34, 14.01, 14.37, 12.73, 17.63, 16.84, 17.26, 19.11, 16.82, 16.77, 17.32, 20.71, 18.94, 17.12, 16.53, 18.72, 20.2, 19.57, 19.51, 18.27, 18.88, 18.98, 21.18, 20.88, 20.1, 18.76, 18.81, 18.59, 18.36, 16.87, 19.31, 18.98, 18.17, 18.72, 16.41, 17.99, 19.46, 19.18, 18.95, 18.83, 18.85, 17.63, 19.94, 18.55, 18.45, 19.38, 19.13, 19.14, 20.97, 19.06, 18.96, 19.15, 18.89, 20.03, 20.24, 18.14, 16.17, 18.43, 15.99, 18.75, 18.65, 17.98, 20.16, 17.55, 18.3, 18.94, 15.38, 16.16, 15.56, 15.38, 17.36, 15.57, 15.6, 16.23, 13.07, 13.32, 13.34, 12.22, 11.82, 11.21, 11.43, 12.49, 12.7, 10.79, 11.83, 12.01, 12.26, 11.18, 11.36, 11.19, 11.34, 12.13, 11.75, 11.49, 12.54, 12.02, 12.05, 12.55, 11.14, 12.1, 12.44, 12.15, 11.35, 11.24, 11.02, 11.55, 11.27, 11.4, 10.83, 10.8, 11.26, 10.74, 11.48, 12.21, 11.41, 12.46, 12.19, 11.65, 12.89, 11.56, 11.81, 10.91, 11.23, 10.59, 10.93, 11.27, 11.87, 10.82, 12.11, 12.8, 12.79, 13.37, 12.62, 12.76, 12.38, 12.67, 11.18, 12.7, 12.37, 12.19, 11.23, 13.2, 11.84, 12.3 ))
# Remaining six measurement columns are appended one at a time.
seeds$perimeter <- c( 14.84, 14.57, 14.09, 13.94, 14.99, 14.21, 14.49, 14.1, 15.46, 15.25, 14.85, 14.16, 14.02, 14.06, 14.05, 14.28, 13.83, 14.75, 14.21, 13.57, 14.4, 14.26, 14.9, 13.23, 14.76, 15.16, 13.76, 13.67, 14.18, 14.02, 13.82, 14.94, 14.41, 14.17, 14.68, 15, 15.27, 15.38, 14.52, 14.17, 13.85, 13.85, 13.55, 14.86, 14.54, 14.04, 14.76, 14.56, 14.52, 14.67, 14.4, 14.91, 14.61, 14.28, 14.6, 14.77, 14.35, 14.43, 14.77, 13.47, 12.86, 12.63, 13.19, 13.84, 13.57, 13.5, 14.37, 14.29, 14.39, 13.75, 15.98, 15.67, 15.73, 16.26, 15.51, 15.62, 15.91, 17.23, 16.49, 15.55, 15.34, 16.19, 16.89, 16.74, 16.71, 16.09, 16.26, 16.66, 17.21, 17.05, 16.99, 16.2, 16.29, 16.05, 16.52, 15.65, 16.59, 16.57, 16.26, 16.34, 15.25, 15.86, 16.5, 16.63, 16.42, 16.29, 16.17, 15.86, 16.92, 16.22, 16.12, 16.72, 16.31, 16.61, 17.25, 16.45, 16.2, 16.45, 16.23, 16.9, 16.91, 16.12, 15.38, 15.97, 14.89, 16.18, 16.41, 15.85, 17.03, 15.66, 15.89, 16.32, 14.9, 15.33, 14.89, 14.66, 15.76, 15.15, 15.11, 15.18, 13.92, 13.94, 13.95, 13.32, 13.4, 13.13, 13.13, 13.46, 13.71, 12.93, 13.23, 13.52, 13.6, 13.04, 13.05, 13.05, 12.87, 13.73, 13.52, 13.22, 13.67, 13.33, 13.41, 13.57, 12.79, 13.15, 13.59, 13.45, 13.12, 13, 13, 13.1, 12.97, 13.08, 12.96, 12.57, 13.01, 12.73, 13.05, 13.47, 12.95, 13.41, 13.36, 13.07, 13.77, 13.31, 13.45, 12.8, 12.82, 12.41, 12.8, 12.86, 13.02, 12.83, 13.27, 13.47, 13.53, 13.78, 13.67, 13.38, 13.44, 13.32, 12.72, 13.41, 13.47, 13.2, 12.88, 13.66, 13.21, 13.34 )
seeds$compactness <- c( 0.87, 0.88, 0.9, 0.9, 0.9, 0.9, 0.88, 0.89, 0.87, 0.89, 0.87, 0.88, 0.89, 0.88, 0.87, 0.9, 0.92, 0.91, 0.92, 0.87, 0.86, 0.87, 0.9, 0.87, 0.87, 0.88, 0.86, 0.86, 0.88, 0.86, 0.87, 0.87, 0.85, 0.87, 0.88, 0.9, 0.87, 0.91, 0.88, 0.89, 0.89, 0.89, 0.9, 0.88, 0.9, 0.88, 0.89, 0.89, 0.88, 0.87, 0.88, 0.89, 0.85, 0.88, 0.86, 0.87, 0.88, 0.9, 0.89, 0.84, 0.87, 0.88, 0.89, 0.87, 0.87, 0.89, 0.87, 0.86, 0.87, 0.85, 0.87, 0.86, 0.88, 0.91, 0.88, 0.86, 0.86, 0.88, 0.88, 0.89, 0.88, 0.9, 0.89, 0.88, 0.88, 0.89, 0.9, 0.86, 0.9, 0.9, 0.87, 0.9, 0.89, 0.91, 0.85, 0.86, 0.88, 0.87, 0.86, 0.88, 0.89, 0.9, 0.9, 0.87, 0.88, 0.89, 0.91, 0.88, 0.88, 0.89, 0.89, 0.87, 0.9, 0.87, 0.89, 0.89, 0.91, 0.89, 0.9, 0.88, 0.89, 0.88, 0.86, 0.91, 0.91, 0.9, 0.87, 0.9, 0.87, 0.9, 0.91, 0.89, 0.87, 0.86, 0.88, 0.9, 0.88, 0.85, 0.86, 0.88, 0.85, 0.86, 0.86, 0.87, 0.83, 0.82, 0.83, 0.87, 0.85, 0.81, 0.85, 0.82, 0.83, 0.83, 0.84, 0.83, 0.86, 0.81, 0.81, 0.83, 0.84, 0.85, 0.84, 0.86, 0.86, 0.88, 0.85, 0.84, 0.83, 0.84, 0.82, 0.85, 0.84, 0.84, 0.81, 0.86, 0.84, 0.83, 0.85, 0.85, 0.86, 0.87, 0.86, 0.86, 0.85, 0.82, 0.82, 0.84, 0.86, 0.86, 0.84, 0.86, 0.88, 0.83, 0.86, 0.89, 0.88, 0.88, 0.85, 0.9, 0.86, 0.9, 0.87, 0.89, 0.86, 0.88, 0.85, 0.89, 0.85, 0.87 )
seeds$length <- c( 5.76, 5.55, 5.29, 5.32, 5.66, 5.39, 5.56, 5.42, 6.05, 5.88, 5.71, 5.44, 5.44, 5.48, 5.48, 5.35, 5.12, 5.53, 5.21, 5.23, 5.66, 5.52, 5.62, 5.1, 5.79, 5.83, 5.39, 5.39, 5.54, 5.52, 5.45, 5.76, 5.72, 5.58, 5.71, 5.71, 5.83, 5.83, 5.66, 5.4, 5.35, 5.35, 5.14, 5.88, 5.58, 5.38, 5.7, 5.57, 5.54, 5.68, 5.58, 5.67, 5.71, 5.5, 5.74, 5.7, 5.39, 5.38, 5.66, 5.16, 5.01, 4.9, 5.08, 5.39, 5.26, 5.14, 5.63, 5.61, 5.57, 5.41, 6.19, 6, 5.98, 6.15, 6.02, 5.93, 6.06, 6.58, 6.45, 5.85, 5.88, 6.01, 6.29, 6.38, 6.37, 6.17, 6.08, 6.55, 6.57, 6.45, 6.58, 6.17, 6.27, 6.04, 6.67, 6.14, 6.34, 6.45, 6.27, 6.22, 5.72, 5.89, 6.11, 6.37, 6.25, 6.04, 6.15, 6.03, 6.67, 6.15, 6.11, 6.3, 6.18, 6.26, 6.56, 6.42, 6.05, 6.25, 6.23, 6.49, 6.32, 6.06, 5.76, 5.98, 5.36, 6.11, 6.29, 5.98, 6.51, 5.79, 5.98, 6.14, 5.88, 5.84, 5.78, 5.48, 6.14, 5.92, 5.83, 5.87, 5.47, 5.54, 5.39, 5.22, 5.31, 5.28, 5.18, 5.27, 5.39, 5.32, 5.26, 5.41, 5.41, 5.22, 5.17, 5.25, 5.05, 5.39, 5.44, 5.3, 5.45, 5.35, 5.27, 5.33, 5.01, 5.11, 5.32, 5.42, 5.18, 5.09, 5.33, 5.17, 5.09, 5.14, 5.28, 4.98, 5.19, 5.14, 5.18, 5.36, 5.09, 5.24, 5.24, 5.11, 5.5, 5.36, 5.41, 5.09, 5.09, 4.9, 5.05, 5.09, 5.13, 5.18, 5.24, 5.16, 5.22, 5.32, 5.41, 5.07, 5.22, 4.98, 5.01, 5.18, 5.2, 5.14, 5.14, 5.24, 5.17, 5.24 )
seeds$width <- c( 3.31, 3.33, 3.34, 3.38, 3.56, 3.31, 3.26, 3.3, 3.46, 3.5, 3.24, 3.2, 3.2, 3.16, 3.11, 3.33, 3.38, 3.51, 3.47, 3.05, 3.13, 3.17, 3.51, 2.94, 3.25, 3.42, 3.03, 2.96, 3.22, 3.06, 2.98, 3.37, 3.19, 3.15, 3.33, 3.48, 3.46, 3.68, 3.29, 3.3, 3.16, 3.16, 3.2, 3.4, 3.46, 3.15, 3.39, 3.38, 3.29, 3.26, 3.27, 3.43, 3.11, 3.2, 3.11, 3.21, 3.38, 3.41, 3.42, 3.03, 2.85, 2.88, 3.04, 3.07, 3.03, 3.12, 3.19, 3.16, 3.15, 2.88, 3.56, 3.48, 3.59, 3.93, 3.49, 3.44, 3.4, 3.81, 3.64, 3.57, 3.47, 3.86, 3.86, 3.77, 3.8, 3.65, 3.76, 3.67, 4.03, 4.03, 3.79, 3.8, 3.69, 3.86, 3.48, 3.46, 3.81, 3.55, 3.51, 3.68, 3.52, 3.69, 3.89, 3.68, 3.75, 3.79, 3.81, 3.57, 3.76, 3.67, 3.77, 3.79, 3.9, 3.74, 3.99, 3.72, 3.9, 3.82, 3.77, 3.86, 3.96, 3.56, 3.39, 3.77, 3.58, 3.87, 3.59, 3.69, 3.77, 3.69, 3.75, 3.83, 3.27, 3.4, 3.41, 3.46, 3.57, 3.23, 3.29, 3.47, 2.99, 3.07, 3.07, 2.97, 2.78, 2.69, 2.72, 2.97, 2.91, 2.65, 2.84, 2.78, 2.83, 2.69, 2.75, 2.67, 2.85, 2.75, 2.68, 2.69, 2.88, 2.81, 2.85, 2.97, 2.79, 2.94, 2.9, 2.84, 2.67, 2.71, 2.7, 2.85, 2.76, 2.76, 2.64, 2.82, 2.71, 2.64, 2.76, 2.89, 2.77, 3.02, 2.91, 2.85, 3.03, 2.68, 2.72, 2.67, 2.82, 2.79, 2.72, 2.8, 2.95, 2.63, 2.98, 3.13, 3.05, 3.13, 2.91, 3.15, 2.99, 3.13, 2.81, 3.09, 2.96, 2.98, 2.8, 3.23, 2.84, 2.97 )
seeds$asymmetry <- c( 2.22, 1.02, 2.7, 2.26, 1.36, 2.46, 3.59, 2.7, 2.04, 1.97, 4.54, 1.72, 3.99, 3.14, 2.93, 4.18, 5.23, 1.6, 1.77, 4.1, 3.07, 2.69, 0.77, 1.42, 1.79, 0.9, 3.37, 2.5, 2.75, 3.53, 0.86, 3.41, 3.92, 2.12, 2.13, 2.27, 2.82, 2.96, 3.11, 6.68, 2.59, 2.25, 2.46, 4.71, 3.13, 1.56, 1.37, 2.96, 2.7, 2.13, 3.98, 5.59, 4.12, 3.33, 1.48, 1.93, 2.8, 1.14, 2, 1.5, 2.7, 2.27, 3.22, 4.16, 1.18, 2.35, 1.31, 2.22, 1.46, 3.53, 4.08, 4.67, 4.54, 2.94, 4, 4.92, 3.82, 4.45, 5.06, 2.86, 5.53, 5.32, 5.17, 1.47, 2.96, 2.44, 1.65, 3.69, 5.78, 5.02, 1.96, 3.12, 3.24, 6, 4.93, 3.7, 3.48, 2.14, 2.85, 2.19, 4.22, 2.07, 4.31, 3.36, 3.37, 2.55, 2.84, 3.75, 3.25, 1.74, 2.23, 3.68, 2.11, 6.68, 4.68, 2.25, 4.33, 3.08, 3.64, 3.06, 5.9, 3.62, 4.29, 2.98, 3.34, 4.19, 4.39, 2.26, 1.91, 5.37, 2.84, 2.91, 4.46, 4.27, 4.97, 3.6, 3.53, 2.64, 2.73, 3.77, 5.3, 7.04, 6, 5.47, 4.47, 6.17, 2.22, 4.42, 3.26, 5.46, 5.2, 6.99, 4.76, 3.33, 4.05, 5.81, 3.35, 4.83, 4.38, 5.39, 3.08, 4.27, 4.99, 4.42, 6.39, 2.2, 4.92, 3.64, 4.34, 3.52, 6.74, 6.71, 4.31, 5.59, 5.18, 4.77, 5.34, 4.7, 5.88, 1.66, 4.96, 4.99, 4.86, 5.21, 6.18, 4.06, 4.9, 4.18, 7.52, 4.97, 5.4, 3.98, 3.6, 4.85, 4.13, 4.87, 5.48, 4.67, 3.31, 2.83, 5.47, 2.3, 4.05, 8.46, 3.92, 3.63, 4.33, 8.31, 3.6, 5.64 )
seeds$groove_length <- c( 5.22, 4.96, 4.83, 4.8, 5.17, 4.96, 5.22, 5, 5.88, 5.53, 5.31, 5, 4.74, 4.87, 4.83, 4.78, 4.78, 5.05, 4.65, 4.91, 5.18, 5.22, 5.09, 4.96, 5, 5.31, 4.83, 4.87, 5.04, 5.1, 5.06, 5.23, 5.3, 5.01, 5.36, 5.44, 5.53, 5.48, 5.31, 5, 5.18, 5.18, 4.78, 5.53, 5.18, 4.96, 5.13, 5.17, 5.11, 5.35, 5.14, 5.14, 5.4, 5.22, 5.49, 5.44, 5.04, 5.09, 5.22, 4.52, 4.61, 4.7, 4.61, 5.09, 4.78, 4.61, 5.15, 5.13, 5.3, 5.07, 6.06, 5.88, 5.79, 6.08, 5.84, 5.8, 5.92, 6.45, 6.36, 5.75, 5.88, 5.88, 6.19, 6.27, 6.18, 6.2, 6.11, 6.5, 6.23, 6.32, 6.45, 6.05, 6.05, 5.88, 6.45, 5.97, 6.24, 6.45, 6.27, 6.1, 5.62, 5.84, 6.01, 6.23, 6.15, 5.88, 6.2, 5.93, 6.55, 5.89, 5.79, 5.96, 5.92, 6.05, 6.32, 6.16, 5.75, 6.18, 5.97, 6.32, 6.19, 6.01, 5.7, 5.91, 5.14, 5.99, 6.1, 5.92, 6.18, 5.66, 5.96, 5.95, 5.8, 5.8, 5.85, 5.44, 5.97, 5.88, 5.75, 5.92, 5.39, 5.44, 5.31, 5.22, 5.18, 5.28, 5.13, 5, 5.32, 5.19, 5.31, 5.27, 5.36, 5, 5.26, 5.22, 5, 5.22, 5.31, 5.31, 5.49, 5.31, 5.05, 5.18, 5.05, 5.06, 5.27, 5.34, 5.13, 5.09, 5.16, 4.96, 5, 5.09, 5.18, 5.06, 5.09, 4.96, 5, 5.18, 4.83, 5.15, 5.16, 5.13, 5.32, 5.18, 5.35, 4.96, 4.96, 4.79, 5.04, 5, 5.13, 5.09, 5.01, 4.91, 4.96, 5.09, 5.23, 4.83, 5.04, 4.75, 4.83, 5, 5, 4.87, 5, 5.06, 5.04, 5.06 )
str(seeds)
## 'data.frame': 210 obs. of 7 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.87 0.88 0.9 0.9 0.9 0.9 0.88 0.89 0.87 0.89 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 5.39 5.56 5.42 6.05 5.88 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 3.31 3.26 3.3 3.46 3.5 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.36 2.46 3.59 2.7 2.04 1.97 ...
## $ groove_length: num 5.22 4.96 4.83 4.8 5.17 4.96 5.22 5 5.88 5.53 ...
# True wheat-variety labels (1, 2, 3) for the 210 seeds; used only to
# evaluate the clusters afterwards, never shown to kmeans().
seeds_type <- c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
# Do k-means clustering with three clusters, repeat 20 times: seeds_km
# (nstart=20 restarts from 20 random initializations and keeps the best)
seeds_km <- kmeans(seeds, centers=3, nstart=20)
# Print out seeds_km
seeds_km
## K-means clustering with 3 clusters of sizes 61, 77, 72
##
## Cluster means:
## area perimeter compactness length width asymmetry groove_length
## 1 18.72180 16.29738 0.8855738 6.209016 3.721967 3.603607 6.065902
## 2 11.96442 13.27481 0.8529870 5.229481 2.872857 4.759870 5.088442
## 3 14.64847 14.46042 0.8794444 5.563333 3.277639 2.649306 5.192778
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3
## [36] 3 3 1 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 3 2
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 3 3 3 3 1 3 3 3
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 184.0488 195.7171 207.4138
## (between_SS / total_SS = 78.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Compare clusters with actual seed types. Set k-means clusters as rows
# (cluster numbering is arbitrary; only the grouping matters)
table(seeds_km$cluster, seeds_type)
## seeds_type
## 1 2 3
## 1 1 60 0
## 2 9 0 68
## 3 60 10 2
# Plot the length as function of width. Color by cluster
plot(x=seeds$width, y=seeds$length, col=seeds_km$cluster)
# Apply kmeans to seeds twice: seeds_km_1 and seeds_km_2
# NOTE(review): nstart=1 means a single random start per call, so the two
# solutions (and the ratio below) vary between runs
seeds_km_1 <- kmeans(seeds, centers=5, nstart=1)
seeds_km_2 <- kmeans(seeds, centers=5, nstart=1)
# Return the ratio of the within cluster sum of squares
seeds_km_1$tot.withinss / seeds_km_2$tot.withinss
## [1] 1.029281
# Compare the resulting clusters
table(seeds_km_1$cluster, seeds_km_2$cluster)
##
## 1 2 3 4 5
## 1 0 0 0 15 39
## 2 0 22 0 0 9
## 3 0 0 34 21 0
## 4 47 1 0 0 0
## 5 0 0 22 0 0
# School results: grade-4 and grade-6 reading/arithmetic scores for 25 schools.
school_result <- data.frame(reading.4=c( 2.7, 3.9, 4.8, 3.1, 3.4, 3.1, 4.6, 3.1, 3.8, 5.2, 3.9, 4.1,
5.7, 3, 2.9, 3.4, 4, 3, 4, 3, 3.6, 3.1, 3.2, 3, 3.8
),
arithmetic.4=c( 3.2, 3.8, 4.1, 3.5, 3.7, 3.4, 4.4, 3.3, 3.7, 4.9, 3.8, 4,
5.1, 3.2, 3.3, 3.3, 4.2, 3, 4.1, 3.2, 3.6, 3.2, 3.3, 3.4, 4
),
reading.6=c( 4.5, 5.9, 6.8, 4.3, 5.1, 4.1, 6.6, 4, 4.7, 8.2, 5.2, 5.6, 7,
4.5, 4.5, 4.4, 5.2, 4.6, 5.9, 4.4, 5.3, 4.6, 5.4, 4.2, 6.9
),
arithmetic.6=c( 4.8, 6.2, 5.5, 4.6, 5.6, 4.7, 6.1, 4.9, 4.9, 6.9, 5.4, 5.6,
6.3, 5, 5.1, 5, 5.4, 5, 5.8, 5.1, 5.4, 5, 5.3, 4.7, 6.7
)
)
# Explore the structure of your data
str(school_result)
## 'data.frame': 25 obs. of 4 variables:
## $ reading.4 : num 2.7 3.9 4.8 3.1 3.4 3.1 4.6 3.1 3.8 5.2 ...
## $ arithmetic.4: num 3.2 3.8 4.1 3.5 3.7 3.4 4.4 3.3 3.7 4.9 ...
## $ reading.6 : num 4.5 5.9 6.8 4.3 5.1 4.1 6.6 4 4.7 8.2 ...
## $ arithmetic.6: num 4.8 6.2 5.5 4.6 5.6 4.7 6.1 4.9 4.9 6.9 ...
# Initialise ratio_ss (one WSS/TSS ratio per candidate number of clusters)
ratio_ss <- rep(0, 7)
# Finish the for-loop.
for (k in 1:7) {
# Apply k-means to school_result: school_km
school_km <- kmeans(school_result, centers=k, nstart=20)
# Save the ratio between of WSS to TSS in kth element of ratio_ss
ratio_ss[k] <- school_km$tot.withinss / school_km$totss
}
# Make a scree plot with type "b" and xlab "k"
# (look for the "elbow" where adding clusters stops helping much)
plot(ratio_ss, type="b", xlab="k")
# Athletics records: national record times per running distance, one row per
# country; built column by column (X100m ... continued below this chunk).
run_record <- data.frame(X100m=c( 10.23, 9.93, 10.15, 10.14, 10.27, 10, 9.84, 10.1, 10.17, 10.29, 10.97, 10.32, 10.24, 10.29, 10.16, 10.21, 10.02, 10.06, 9.87, 10.11, 10.32, 10.08, 10.33, 10.2, 10.35, 10.2, 10.01, 10, 10.28, 10.34, 10.6, 10.41, 10.3, 10.13, 10.21, 10.64, 10.19, 10.11, 10.08, 10.4, 10.57, 10, 9.86, 10.21, 10.11, 10.78, 10.37, 10.17, 10.18, 10.16, 10.36, 10.23, 10.38, 9.78 )
)
run_record$X200m <- c( 20.37, 20.06, 20.45, 20.19, 20.3, 19.89, 20.17, 20.15, 20.42, 20.85, 22.46, 20.96, 20.61, 20.52, 20.65, 20.47, 20.16, 20.23, 19.94, 19.85, 21.09, 20.11, 20.73, 20.93, 20.54, 20.89, 19.72, 20.03, 20.43, 20.41, 21.23, 20.77, 20.92, 20.06, 20.4, 21.52, 20.19, 20.42, 20.17, 21.18, 21.43, 19.98, 20.12, 20.75, 20.23, 21.86, 21.14, 20.59, 20.43, 20.41, 20.81, 20.69, 21.04, 19.32
)
run_record$X400m <- c( 46.18, 44.38, 45.8, 45.02, 45.26, 44.29, 44.72, 45.92, 45.25, 45.84, 51.4, 46.42, 45.77, 45.89, 44.9, 45.49, 44.64, 44.33, 44.36, 45.57, 48.44, 45.43, 45.48, 46.37, 45.58, 46.59, 45.26, 44.78, 44.18, 45.37, 46.95, 47.9, 46.41, 44.69, 44.31, 48.63, 45.68, 46.09, 46.11, 46.77, 45.57, 44.62, 46.11, 45.77, 44.6, 49.98, 47.6, 44.96, 45.54, 44.99, 46.72, 46.05, 46.63, 43.18
)
run_record$X800m <- c( 106.2, 104.4, 106.2, 103.8, 107.4, 102, 105, 105.6, 106.2, 108, 116.4, 112.2, 105, 101.4, 108.6, 104.4, 103.2, 103.8, 102, 105, 109.2, 105.6, 105.6, 109.8, 105, 108, 103.8, 106.2, 102, 104.4, 109.2, 105.6, 107.4, 108, 106.8, 108, 103.8, 104.4, 102.6, 108, 108, 103.2, 105, 105.6, 102.6, 116.4, 110.4, 103.8, 105.6, 102.6, 107.4, 108.6, 106.8, 102.6
)
run_record$X1500m <- c( 220.8, 211.8, 214.8, 214.2, 222, 214.2, 211.8, 219, 216.6, 223.2, 254.4, 230.4, 214.8, 211.2, 223.8, 216.6, 208.8, 211.8, 209.4, 216.6, 224.4, 215.4, 217.8, 226.2, 213.6, 222, 213, 217.2, 206.4, 218.4, 226.2, 220.2, 225.6, 229.8, 217.8, 228, 213, 212.4, 217.2, 240, 229.2, 215.4, 210, 214.2, 212.4, 240.6, 231.6, 208.8, 216.6, 211.8, 226.2, 226.2, 215.4, 207.6
)
run_record$X5000m <- c( 799.8, 775.8, 795.6, 769.8, 878.4, 808.8, 793.8, 803.4, 805.2, 809.4, 1002, 825, 805.2, 805.2, 858.6, 796.2, 778.8, 774.6, 780.6, 808.8, 838.8, 807, 810, 852.6, 784.2, 819.6, 785.4, 793.2, 759.6, 830.4, 834, 818.4, 846.6, 849, 787.8, 851.4, 793.2, 792.6, 786.6, 883.2, 838.2, 797.4, 783, 795, 792, 976.8, 897.6, 782.4, 797.4, 787.8, 834.6, 855, 807, 778.2
)
run_record$X10000m <- c( 1659, 1651.8, 1663.2, 1612.2, 1829.4, 1687.8, 1656, 1685.4, 1690.2, 1672.8, 2122.8, 1728.6, 1668, 1674.6, 1825.8, 1651.2, 1642.8, 1641.6, 1638, 1687.2, 1760.4, 1681.8, 1728.6, 1779, 1666.8, 1723.2, 1636.8, 1654.8, 1587.6, 1710.6, 1707, 1726.2, 1770, 1790.4, 1628.4, 1777.2, 1646.4, 1662, 1652.4, 1881.6, 1742.4, 1673.4, 1632.6, 1660.2, 1674, 2082.6, 1879.2, 1634.4, 1675.8, 1674, 1752, 1780.2, 1699.8, 1633.8
)
run_record$marathon <- c( 7774.2, 7650.6, 7933.2, 7632, 8782.2, 7563, 7805.4, 7931.4, 7750.8, 7870.2, 10275.6, 7993.8, 7894.2, 7765.8, 8760, 7869, 7581.6, 7708.2, 7627.8, 7922.4, 7951.8, 7926, 7920, 8350.8, 7749, 8052.6, 7637.4, 7569.6, 7473, 7632, 7755.6, 8041.8, 8956.2, 8584.2, 7631.4, 8374.2, 7698.6, 7715.4, 7810.2, 8887.8, 8306.4, 7753.8, 7581.6, 7938, 7749.6, 9690, 8653.2, 7633.8, 7822.8, 7773.6, 8061, 8359.8, 7815, 7522.8
)
rownames(run_record) <- c( 'Argentina', 'Australia', 'Austria', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile', 'China', 'Columbia', 'CookIslands', 'CostaRica', 'CzechRepublic', 'Denmark', 'DominicanRepub', 'Finland', 'France', 'Germany', 'GreatBritain', 'Greece', 'Guatemala', 'Hungary', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kenya', 'Korea,South', 'Korea,North', 'Luxembourg', 'Malaysia', 'Mauritius', 'Mexico', 'Myanmar(Burma)', 'Netherlands', 'NewZealand', 'Norway', 'PapuaNewGuinea', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Samoa', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'U.S.A.'
)
# run_record: national record times (seconds) for 54 countries across
# eight running events, built just above.  First inspect, then cluster
# the RAW (unscaled) data.
# Explore your data with str() and summary()
str(run_record)
## 'data.frame': 54 obs. of 8 variables:
## $ X100m : num 10.23 9.93 10.15 10.14 10.27 ...
## $ X200m : num 20.4 20.1 20.4 20.2 20.3 ...
## $ X400m : num 46.2 44.4 45.8 45 45.3 ...
## $ X800m : num 106 104 106 104 107 ...
## $ X1500m : num 221 212 215 214 222 ...
## $ X5000m : num 800 776 796 770 878 ...
## $ X10000m : num 1659 1652 1663 1612 1829 ...
## $ marathon: num 7774 7651 7933 7632 8782 ...
# The summary shows wildly different scales: X100m is around 10 while
# marathon is around 8000 -- so unscaled Euclidean distance (and hence
# k-means) is dominated by the long-distance events.
summary(run_record)
## X100m X200m X400m X800m
## Min. : 9.78 Min. :19.32 Min. :43.18 Min. :101.4
## 1st Qu.:10.10 1st Qu.:20.17 1st Qu.:44.91 1st Qu.:103.8
## Median :10.20 Median :20.43 Median :45.58 Median :105.6
## Mean :10.22 Mean :20.54 Mean :45.83 Mean :106.1
## 3rd Qu.:10.32 3rd Qu.:20.84 3rd Qu.:46.32 3rd Qu.:108.0
## Max. :10.97 Max. :22.46 Max. :51.40 Max. :116.4
## X1500m X5000m X10000m marathon
## Min. :206.4 Min. : 759.6 Min. :1588 Min. : 7473
## 1st Qu.:213.0 1st Qu.: 788.9 1st Qu.:1653 1st Qu.: 7701
## Median :216.6 Median : 805.2 Median :1675 Median : 7819
## Mean :219.2 Mean : 817.1 Mean :1712 Mean : 8009
## 3rd Qu.:224.2 3rd Qu.: 834.5 3rd Qu.:1739 3rd Qu.: 8050
## Max. :254.4 Max. :1002.0 Max. :2123 Max. :10276
# Cluster run_record using k-means: run_km. 5 clusters, repeat 20 times
run_km <- kmeans(run_record, centers=5, nstart=20)
# Plot the 100m as function of the marathon. Color using clusters
plot(x=run_record$marathon, y=run_record$X100m, col=run_km$cluster)
# Calculate Dunn's index: dunn_km. Print it.
# Dunn = min inter-cluster separation / max cluster diameter; higher
# is better.  The low value here is the baseline to beat after scaling.
(dunn_km <- clValid::dunn(clusters=run_km$cluster, Data=run_record))
## [1] 0.05651773
# Repeat the clustering after standardizing each column (mean 0, sd 1)
# so every event carries equal weight in the distance computation.
# Standardize run_record, transform to a dataframe: run_record_sc
run_record_sc <- as.data.frame( scale(run_record) )
# Cluster run_record_sc using k-means: run_km_sc. 5 groups, let R start over 20 times
run_km_sc <- kmeans(run_record_sc, centers=5, nstart=20)
# Plot records on 100m as function of the marathon. Color using the clusters in run_km_sc
plot(x=run_record$marathon, y=run_record$X100m, col=run_km_sc$cluster,
xlab="Marathon", ylab="100 metres"
)
# Compare the resulting clusters in a nice table
# Rows = clusters from the unscaled fit (run_km), columns = clusters
# from the scaled fit; the scatter shows the two solutions disagree.
table(run_km$cluster, run_km_sc$cluster)
##
## 1 2 3 4 5
## 1 0 0 2 0 2
## 2 2 0 0 0 0
## 3 0 0 6 0 0
## 4 0 13 0 10 1
## 5 0 1 0 10 7
# Calculate Dunn's index: dunn_km_sc. Print it.
# 0.145 vs 0.057 for the unscaled fit above: scaling improves the
# separation/diameter trade-off.
(dunn_km_sc <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc))
## [1] 0.1453556
# Hierarchical clustering of the scaled running records: compare
# single-linkage and complete-linkage dendrograms, each cut at k = 5,
# then compare both against the scaled k-means solution via Dunn's
# index and cross-tabulations.
# Apply dist() to run_record_sc: run_dist
run_dist <- dist(run_record_sc)
# Apply hclust() to run_dist: run_single
run_single <- hclust(run_dist, method="single")
# Apply cutree() to run_single: memb_single
memb_single <- cutree(run_single, k=5)
# Apply plot() on run_single to draw the dendrogram
plot(run_single)
# Apply rect.hclust() on run_single to draw the boxes
rect.hclust(run_single, k=5, border=2:6)
# Apply hclust() to run_dist: run_complete
run_complete <- hclust(run_dist, method="complete")
# Apply cutree() to run_complete: memb_complete
memb_complete <- cutree(run_complete, k=5)
# Apply plot() on run_complete to draw the dendrogram
plot(run_complete)
# Apply rect.hclust() on run_complete to draw the boxes
rect.hclust(run_complete, k=5, border=2:6)
# table() the clusters memb_single and memb_complete. Put memb_single in the rows
# Row 1 absorbs almost everything (27+7+14+0+1 = 49 of 54), the
# classic "chaining" behaviour of single linkage; complete linkage
# produces more balanced groups.
table(memb_single, memb_complete)
## memb_complete
## memb_single 1 2 3 4 5
## 1 27 7 14 0 1
## 2 0 0 0 1 0
## 3 0 0 0 0 1
## 4 0 0 0 0 2
## 5 0 0 0 1 0
# Dunn's index for k-means: dunn_km
# NOTE(review): this silently overwrites the unscaled dunn_km computed
# earlier (0.0565...) with the scaled-k-means value.
dunn_km <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc)
# Dunn's index for single-linkage: dunn_single
dunn_single <- clValid::dunn(clusters=memb_single, Data=run_record_sc)
# Dunn's index for complete-linkage: dunn_complete
dunn_complete <- clValid::dunn(clusters=memb_complete, Data=run_record_sc)
# Compare k-means with single-linkage
table(run_km_sc$cluster, memb_single)
## memb_single
## 1 2 3 4 5
## 1 0 1 0 0 1
## 2 14 0 0 0 0
## 3 6 0 0 2 0
## 4 20 0 0 0 0
## 5 9 0 1 0 0
# Compare k-means with complete-linkage
table(run_km_sc$cluster, memb_complete)
## memb_complete
## 1 2 3 4 5
## 1 0 0 0 2 0
## 2 7 7 0 0 0
## 3 0 0 6 0 2
## 4 20 0 0 0 0
## 5 0 0 8 0 2
crime_data <- data.frame(murder=c( 13.2, 10, 8.1, 8.8, 9, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.6, 10.4, 7.2, 2.2, 6, 9.7, 15.4, 2.1, 11.3, 4.4, 12.1, 2.7, 16.1, 9, 6, 4.3, 12.2, 2.1, 7.4, 11.4, 11.1, 13, 0.8, 7.3, 6.6, 4.9, 6.3, 3.4, 14.4, 3.8, 13.2, 12.7, 3.2, 2.2, 8.5, 4, 5.7, 2.6, 6.8 )
)
crime_data$assault <- c( 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 249, 113, 56, 115, 109, 249, 83, 300, 149, 255, 72, 259, 178, 109, 102, 252, 57, 159, 285, 254, 337, 45, 120, 151, 159, 106, 174, 279, 86, 188, 201, 120, 48, 156, 145, 81, 53, 161
)
crime_data$urb_pop <- c( 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 66, 52, 66, 51, 67, 85, 74, 66, 44, 70, 53, 62, 81, 56, 89, 70, 86, 45, 44, 75, 68, 67, 72, 87, 48, 45, 59, 80, 80, 32, 63, 73, 39, 66, 60
)
crime_data$rape <- c( 21.2, 44.5, 31, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 20.2, 14.2, 24, 21, 11.3, 18, 16.3, 22.2, 7.8, 27.8, 16.3, 35.1, 14.9, 17.1, 28.2, 16.4, 16.5, 46, 9.5, 18.8, 32.1, 26.1, 16.1, 7.3, 21.4, 20, 29.3, 14.9, 8.3, 22.5, 12.8, 26.9, 25.5, 22.9, 11.2, 20.7, 26.2, 9.3, 10.8, 15.6
)
rownames(crime_data) <- c( 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
)
# crime_data (built just above): per-state US crime statistics --
# murder, assault, rape rates plus percent urban population.
# Scale, then compare k-means (k = 4) against single-linkage
# hierarchical clustering using Dunn's index.
str(crime_data)
## 'data.frame': 50 obs. of 4 variables:
## $ murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ assault: num 236 263 294 190 276 204 110 238 335 211 ...
## $ urb_pop: num 58 48 80 50 91 78 77 72 80 60 ...
## $ rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Scale the dataset: crime_data_sc
crime_data_sc <- as.data.frame(scale(crime_data))
# Perform k-means clustering: crime_km
crime_km <- kmeans(crime_data_sc, centers=4, nstart=20)
# Perform single-linkage hierarchical clustering
## Calculate the distance matrix: dist_matrix
dist_matrix <- dist(crime_data_sc)
## Calculate the clusters using hclust(): crime_single
crime_single <- hclust(dist_matrix, method="single")
## Cut the clusters using cutree: memb_single
memb_single <- cutree(crime_single, k=4)
# Calculate the Dunn's index for both clusterings: dunn_km, dunn_single
dunn_km <- clValid::dunn(clusters=crime_km$cluster, Data=crime_data_sc)
dunn_single <- clValid::dunn(clusters=memb_single, Data=crime_data_sc)
# Print out the results
dunn_km
## [1] 0.1604403
dunn_single
## [1] 0.2438734
# Single linkage wins on Dunn's index here, but the table shows why
# that is misleading: its first cluster holds 13+8+16+9 = 46 of the
# 50 states (chaining), leaving three near-singleton clusters.
table(crime_km$cluster, memb_single)
## memb_single
## 1 2 3 4
## 1 13 0 0 0
## 2 8 0 0 0
## 3 16 0 0 0
## 4 9 1 2 1
Chapter 1 - Unsupervised Learning in R
Introduction to the main types of machine learning:
Introduction to k-means clustering - assume a number of sub-groups, then iteratively assign/update the clusters/centroids:
How k-means works and practical matters:
Introduction to the Pokemon data - 800 Pokemon each with 6 features:
Example code includes:
x <- matrix(data=NA, nrow=300, ncol=2)
x[,1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72, 1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, 1.57, 1.74, 0.24, 2.46, 1.36, 2.46, 2.7, 3.04, 1.39, 2.5, 0.28, 1.22, 1.15, -0.41, 2.04, 2.21, 1.64, 2.76, 1.27, 0.63, 2.43, 1.19, 3.44, 1.57, 2.66, 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4, 1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46, 1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, 2.61, 1.78, 1.82, 2.93, 2.82, 3.39, 1.52, 2.65, 3.39, 0.89, 1.14, 0.87, 0.54, 2.08, 2.65, -3.8, -3.96, -6, -3.15, -5.67, -4.89, -5.42, -5.12, -4.81, -4.88, -5.03, -4.89, -5.49, -5.5, -6.66, -5.38, -5.51, -2.3, -6.36, -4.86, -6.49, -6.47, -4.88, -6, -5, -5.43, -5.61, -7.02, -6.22, -4.82, -4.43, -5.49, -5, -3.88, -3.56, -6.1, -5.12, -3.8, -5.47, -5.05, -5.09, -5.89, -5.44, -5.03, -5.41, -3.89, -5.48, -5.43, -4.3, -6.06, -5.04, -6.55, -3.83, -5.27, -5.47, -6.24, -5.01, -5.8, -5.53, -3.71, -5.18, -6.07, -4.84, -5.36, -4.41, -3.57, -5.99, -4.55, -4.92, -4.1, -5.23, -4.16, -6.75, -3.31, -4.14, -5.15, -6.45, -4.36, -4.52, -5.01, -4.85, -5.58, -4.63, -4.71, -5.28, -6.34, -4.3, -4.45, -5.84, -6.59, -4.8, -5.35, -4.75, -6.29, -5.96, -3.91, -4.6, -4.41, -3.18, -4.87, -7, -4.67, -3.83, -2.94, -6.38, -6.15, -5.71, -6.05, -5.65, -5.19, -6.2, -2.96, -4.89, -5.08, -4.5, -4.96, -5.13, -3.52, -5.22, -6.28, -4.61, -5.35, -5.52, -6.07, -4.57, -5.17, -4.48, -5.23, -5.66, -3.75, -5.27, -4.05, -6.2, -5.47, -5.27, -5.39, -3.65, -5.02, -4.76, -5.94, -5.73, -4, -3.74, -3.75, -6.38, -2.95, -3.98, -5.03, -4.3, -5.97, -0.1, 1.05, -0.2, 1.19, 2.3, -0.03, 0.26, 1.05, -0.02, 0.62, 1.87, 1.97, 1.38, -0.85, 0.95, 2.06, 1.81, 0.81, -1.7, 1.06, 1.57, 1.05, 1.16, 1.43, 0.6, 2.31, 1.47, -0.24, 2.38, 2.2, 1.82, -0.66, 0.43, 1.64, 1.04, 1.35, 3.46, 0.18, -1.11, 1.27, 0.31, 1.45, 0.19, 3.21, 0.88, 0.52, 0.83, 1.86, 1.1, -0.63 )
x[,2] <- c( 2, 2.76, 2.04, 2.74, 1.85, 1.94, 2.48, 2.99, 0.75, 1.97, 1.93, 1.24, 0.97, 1.37, 2.59, 1.58, 1.22, 2.16, 0.76, 3.05, 1.52, 2.19, 2.05, 2, 3.81, 1.17, 3.15, 2.03, 1.16, 1.93, 2.75, 1.57, 1.23, 2.15, 2.99, 1.93, 0.61, 0.69, 1.23, 1.47, 1.98, 2.67, 1.57, 0.89, 2.61, 2.28, 3.16, 0.32, 2.09, 3.35, 2.72, 1.17, 2.73, 1.13, 1.55, 3.19, 1.71, 2.83, 1.71, 0.42, 1.15, 0.91, 1.52, 1.66, 1.85, 1.76, 3.89, 0.61, 1.59, 2.35, 3.63, 2.09, 3.24, 0.36, 3.45, 1.31, 1.72, 0.89, 2.13, 3.79, 4.42, 0.92, 2.49, 3.39, 1.8, 1.78, 1.7, 2.6, 3.4, 2.69, 2.32, 1.7, 2.5, 1.45, 1.72, 3.1, 2.44, 2.24, 1.74, 2.93, 3.33, 1.13, 2.06, 2.05, 1.42, 1, 2, 2.66, 3.48, 0.09, 1.3, 1.69, 0.34, 1.25, 1.22, 1.28, -0.19, 2.21, 1.37, 3.52, 2.8, 0.55, 2.1, 1.41, 2.89, 2.05, 1.44, 2.44, 2.15, 1.84, 4.02, 1.47, 1.53, 0.45, 1.96, 2.89, -0.07, 1.75, 0.82, 3.44, 3.36, 2.33, 3.43, 1.13, 2.95, 1.41, 2.32, 1.7, 1.72, 2.55, 0.7, 1.75, 2.17, 1.6, 2.1, 1.68, 3.62, 2.71, 4.97, 1.2, 2.81, 4.1, 2.3, 0.92, 0.99, 1.96, 3.31, 2.75, -0.14, 1.3, 1.99, 0.54, 2.69, -0.46, 2.14, 1.61, 1.51, 1.72, 2.31, 2.4, 1.77, 0.08, 0.56, 0.53, 2.76, 1.76, 2.27, 0.44, 1.46, 2.56, 1.82, 1.88, 1.93, 3.21, 1.39, 2.68, 2.9, 0.81, 2.12, 1.99, 3.03, 2.91, 2, 2.14, 1.28, 1.8, 0.97, 1.03, 0.78, 2.84, 3.11, 1.59, 0.87, 1.91, 4.24, 4.04, 0.28, 1.64, 3.53, 1.96, 3.6, 1.67, 2.6, 2.22, 5.23, 2.92, 0.79, 1.4, 2.37, 0.1, 0.2, 0.88, 1.65, 3.24, 1.73, 2.16, 1.94, 1.29, 3.36, 0.9, 1.77, 1.65, 2.53, 3.61, 2.51, 3.38, 2.76, 1.38, 2.08, 3.38, -1.56, 0.32, -0.16, 0.88, 0.75, 0.3, 1.49, -1.53, 0.91, -1.58, 0.59, 0.09, 0.97, 0.08, -1.57, -2.01, 0.54, -0.07, -0.57, -0.31, -0.67, -0.16, -0.93, -1.98, -0.22, 1.05, 1.88, 0, -0.08, 0.96, 0.05, -0.43, -1.74, -1.26, 0.41, -1.46, 1.05, -1.35, -0.19, 0, -0.01, 0.15, 0.6, -0.13, -0.25, 0.16, -0.43, 1.54, -2.17, 1.03 )
# k-means walk-through on x, a 300 x 2 numeric matrix built just
# above (three simulated point clouds), fitting 3 clusters.
str(x)
## num [1:300, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Create the k-means model: km.out
km.out <- kmeans(x, centers=3, nstart=20)
# Inspect the result
# summary() on a kmeans object just lists its components and sizes
# (cluster labels, 3x2 centers matrix stored as 6 values, etc.)
summary(km.out)
## Length Class Mode
## cluster 300 -none- numeric
## centers 6 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Print the cluster membership component of the model
# (one label in 1..3 per row of x)
km.out$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
# Print the km.out object
# The default print method shows sizes, the 3 cluster centers, the
# full assignment vector, per-cluster WSS and between/total SS (87.2%
# of variance explained here), plus the component names.
km.out
## K-means clustering with 3 clusters of sizes 98, 150, 52
##
## Cluster means:
## [,1] [,2]
## 1 2.2170408 2.05153061
## 2 -5.0554667 1.96973333
## 3 0.6642308 -0.09115385
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
##
## Within cluster sum of squares by cluster:
## [1] 148.7013 295.1237 95.4708
## (between_SS / total_SS = 87.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of x
# (points coloured by their assigned cluster)
plot(x, col=km.out$cluster, main="k-means with 3 clusters", xlab="", ylab="")
# Demonstrate run-to-run variability of k-means with a single random
# start: fit six times with nstart = 1 and draw each result in a
# 2 x 3 panel grid, titling every panel with that fit's total
# within-cluster sum of squares (smaller = better local optimum).
par(mfrow = c(2, 3))
for (run in seq_len(6)) {
  # One k-means fit of x into three clusters, single initialisation
  km.out <- kmeans(x, centers = 3, nstart = 1)
  # Colour points by cluster; title is this fit's total WSS
  plot(x,
       col = km.out$cluster,
       main = km.out$tot.withinss,
       xlab = "", ylab = "")
}
# Restore the default single-panel plotting layout
par(mfrow = c(1, 1))
# Elbow-method diagnostic on x: record the total within-cluster sum
# of squares (WSS) for k = 1..k_max clusters, then plot WSS against k.
# (k_max hoisted to one named constant instead of hard-coding 15 twice.)
k_max <- 15
# Initialize total within sum of squares error: wss
# Pre-allocated to its final length; the original `wss <- 0` grew the
# vector one element per iteration.
wss <- numeric(k_max)
# For 1 to k_max cluster centers
for (i in seq_len(k_max)) {
  # nstart = 20 restarts per k to avoid reporting a poor local minimum
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(seq_len(k_max), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
pokemon <- matrix(data=NA, nrow=800, ncol=6)
v1 <- c(45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60, 40, 45, 65, 65, 40, 63, 83, 83, 30, 55, 40, 65, 35, 60, 35, 60, 50, 75, 55, 70, 90, 46, 61, 81, 70, 95, 38, 73, 115, 140, 40, 75, 45, 60, 75, 35, 60, 60, 70, 10, 35, 40, 65, 50, 80, 40, 65, 55, 90, 40, 65, 90, 25, 40, 55, 55, 70, 80, 90, 50, 65, 80, 40, 80, 40, 55, 80, 50, 65, 90, 95, 95, 25, 50, 52, 35, 60, 65, 90, 80, 105, 30, 50, 30, 45, 60, 60, 35, 60, 85, 30, 55, 40, 60, 60, 95, 50, 60, 50, 50, 90, 40, 65, 80, 105, 250, 65, 105, 105, 30, 55, 45, 80, 30, 60, 40, 70, 65, 65, 65, 65, 65, 75, 20, 95, 95, 130, 48, 55, 130, 65, 65, 65, 35, 70, 30, 60, 80, 80, 160, 90, 90, 90, 41, 61, 91, 106, 106, 106, 100, 45, 60, 80, 39, 58, 78, 50, 65, 85, 35, 85, 60, 100, 40, 55, 40, 70, 85, 75, 125, 20, 50, 90, 35, 55, 40, 65, 55, 70, 90, 90, 75, 70, 100, 70, 90, 35, 55, 75, 55, 30, 75, 65, 55, 95, 65, 95, 60, 95, 60, 48, 190, 70, 50, 75, 100, 65, 75, 75, 60, 90, 65, 70, 70, 20, 80, 80, 55, 60, 90, 40, 50, 50, 100, 55, 35, 75, 45, 65, 65, 45, 75, 75, 75, 90, 90, 85, 73, 55, 35, 50, 45, 45, 45, 95, 255, 90, 115, 100, 50, 70, 100, 100, 106, 106, 100, 40, 50, 70, 70, 45, 60, 80, 80, 50, 70, 100, 100, 35, 70, 38, 78, 45, 50, 60, 50, 60, 40, 60, 80, 40, 70, 90, 40, 60, 40, 60, 28, 38, 68, 68, 40, 70, 60, 60, 60, 80, 150, 31, 61, 1, 64, 84, 104, 72, 144, 50, 30, 50, 70, 50, 50, 50, 50, 50, 60, 70, 70, 30, 60, 60, 40, 70, 70, 60, 60, 65, 65, 50, 70, 100, 45, 70, 70, 130, 170, 60, 70, 70, 70, 60, 80, 60, 45, 50, 80, 50, 70, 45, 75, 75, 73, 73, 70, 70, 50, 110, 43, 63, 40, 60, 66, 86, 45, 75, 20, 95, 70, 60, 44, 64, 64, 20, 40, 99, 65, 65, 65, 95, 50, 80, 80, 70, 90, 110, 35, 55, 55, 100, 43, 45, 65, 95, 95, 40, 60, 80, 80, 80, 80, 80, 80, 80, 80, 80, 100, 100, 100, 100, 105, 105, 100, 50, 50, 50, 50, 55, 75, 95, 44, 64, 76, 53, 64, 84, 40, 55, 85, 59, 79, 37, 77, 45, 60, 80, 40, 60, 67, 97, 30, 60, 40, 60, 60, 60, 70, 30, 70, 60, 55, 85, 45, 70, 76, 111, 75, 90, 150, 55, 65, 65, 60, 100, 49, 71, 45, 63, 103, 57, 
67, 50, 20, 100, 76, 50, 58, 68, 108, 108, 135, 40, 70, 70, 68, 108, 40, 70, 48, 83, 74, 49, 69, 45, 60, 90, 90, 70, 70, 110, 115, 100, 75, 75, 85, 86, 65, 65, 75, 110, 85, 68, 68, 60, 45, 70, 50, 50, 50, 50, 50, 50, 75, 80, 75, 100, 90, 91, 110, 150, 150, 120, 80, 100, 70, 100, 100, 120, 100, 45, 60, 75, 65, 90, 110, 55, 75, 95, 45, 60, 45, 65, 85, 41, 64, 50, 75, 50, 75, 50, 75, 76, 116, 50, 62, 80, 45, 75, 55, 70, 85, 55, 67, 60, 110, 103, 103, 75, 85, 105, 50, 75, 105, 120, 75, 45, 55, 75, 30, 40, 60, 40, 60, 45, 70, 70, 50, 60, 95, 70, 105, 105, 75, 50, 70, 50, 65, 72, 38, 58, 54, 74, 55, 75, 50, 80, 40, 60, 55, 75, 45, 60, 70, 45, 65, 110, 62, 75, 36, 51, 71, 60, 80, 55, 50, 70, 69, 114, 55, 100, 165, 50, 70, 44, 74, 40, 60, 60)
pokemon[, 1] <- c( v1, 35, 65, 85, 55, 75, 50, 60, 60, 46, 66, 76, 55, 95, 70, 50, 80, 109, 45, 65, 77, 59, 89, 45, 65, 95, 70, 100, 70, 110, 85, 58, 52, 72, 92, 55, 85, 91, 91, 91, 79, 79, 79, 79, 100, 100, 89, 89, 125, 125, 125, 91, 91, 100, 100, 71, 56, 61, 88, 40, 59, 75, 41, 54, 72, 38, 85, 45, 62, 78, 38, 45, 80, 62, 86, 44, 54, 78, 66, 123, 67, 95, 75, 62, 74, 74, 45, 59, 60, 60, 78, 101, 62, 82, 53, 86, 42, 72, 50, 65, 50, 71, 44, 62, 58, 82, 77, 123, 95, 78, 67, 50, 45, 68, 90, 57, 43, 85, 49, 44, 54, 59, 65, 55, 75, 85, 55, 95, 40, 85, 126, 126, 108, 50, 50, 80, 80, 80 )
v2 <- c(49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20, 45, 35, 25, 90, 150, 45, 60, 80, 80, 56, 81, 60, 90, 60, 85, 55, 90, 75, 100, 47, 62, 92, 57, 72, 102, 45, 70, 41, 76, 45, 70, 45, 80, 50, 65, 80, 70, 95, 55, 65, 55, 80, 45, 70, 52, 82, 80, 105, 70, 110, 50, 65, 95, 20, 35, 50, 50, 80, 100, 130, 75, 90, 105, 40, 70, 80, 95, 120, 85, 100, 65, 75, 75, 35, 60, 65, 85, 110, 45, 70, 80, 105, 65, 95, 35, 50, 65, 65, 45, 48, 73, 105, 130, 30, 50, 40, 95, 50, 80, 120, 105, 55, 65, 90, 85, 130, 5, 55, 95, 125, 40, 65, 67, 92, 45, 75, 45, 110, 50, 83, 95, 125, 155, 100, 10, 125, 155, 85, 48, 55, 65, 65, 130, 60, 40, 60, 80, 115, 105, 135, 110, 85, 90, 100, 64, 84, 134, 110, 190, 150, 100, 49, 62, 82, 52, 64, 84, 65, 80, 105, 46, 76, 30, 50, 20, 35, 60, 90, 90, 38, 58, 40, 25, 30, 20, 40, 50, 75, 40, 55, 75, 95, 80, 20, 50, 100, 75, 35, 45, 55, 70, 30, 75, 65, 45, 85, 65, 65, 85, 75, 60, 72, 33, 80, 65, 90, 70, 75, 85, 125, 80, 120, 95, 130, 150, 10, 125, 185, 95, 80, 130, 40, 50, 50, 100, 55, 65, 105, 55, 40, 80, 60, 90, 90, 95, 60, 120, 80, 95, 20, 35, 95, 30, 63, 75, 80, 10, 85, 115, 75, 64, 84, 134, 164, 90, 130, 100, 45, 65, 85, 110, 60, 85, 120, 160, 70, 85, 110, 150, 55, 90, 30, 70, 45, 35, 70, 35, 50, 30, 50, 70, 40, 70, 100, 55, 85, 30, 50, 25, 35, 65, 85, 30, 60, 40, 130, 60, 80, 160, 45, 90, 90, 51, 71, 91, 60, 120, 20, 45, 45, 65, 75, 85, 85, 105, 70, 90, 110, 140, 40, 60, 100, 45, 75, 75, 50, 40, 73, 47, 60, 43, 73, 90, 120, 140, 70, 90, 60, 100, 120, 85, 25, 45, 60, 100, 70, 100, 85, 115, 40, 70, 110, 115, 100, 55, 95, 48, 78, 80, 120, 40, 70, 41, 81, 95, 125, 15, 60, 70, 90, 75, 115, 165, 40, 70, 68, 50, 130, 150, 23, 50, 80, 120, 40, 60, 80, 64, 104, 84, 90, 30, 75, 95, 135, 145, 55, 75, 135, 145, 100, 50, 75, 80, 100, 90, 130, 100, 150, 150, 180, 150, 180, 100, 150, 180, 70, 95, 68, 89, 109, 58, 78, 104, 51, 66, 86, 55, 75, 120, 45, 85, 25, 85, 65, 85, 120, 30, 70, 125, 165, 42, 52, 29, 59, 79, 69, 94, 30, 80, 45, 65, 105, 35, 60, 48, 
83, 100, 50, 80, 66, 76, 136, 60, 125, 55, 82, 30, 63, 93, 24, 89, 80, 25, 5, 65, 92, 70, 90, 130, 170, 85, 70, 110, 145, 72, 112, 50, 90, 61, 106, 100, 49, 69, 20, 62, 92, 132, 120, 70, 85, 140, 100, 123, 95, 50, 76, 110, 60, 95, 130, 80, 125, 165, 55, 100, 80, 50, 65, 65, 65, 65, 65, 75, 105, 125, 120, 120, 90, 160, 100, 120, 70, 80, 100, 90, 100, 103, 120, 100, 45, 60, 75, 63, 93, 123, 55, 75, 100, 55, 85, 60, 80, 110, 50, 88, 53, 98, 53, 98, 53, 98, 25, 55, 55, 77, 115, 60, 100, 75, 105, 135, 45, 57, 85, 135, 60, 60, 80, 105, 140, 50, 65, 95, 100, 125, 53, 63, 103, 45, 55, 100, 27, 67, 35, 60, 92, 72, 82, 117, 90, 140, 30, 86, 65, 95, 75, 90, 58, 30, 50, 78, 108, 112, 140, 50, 95, 65, 105, 50, 95, 30, 45, 55, 30, 40, 65, 44, 87, 50, 65, 95, 60, 100, 75, 75, 135, 55, 85, 40, 60, 75, 47, 77, 50, 94, 55, 80, 100)
pokemon[,2] <- c( v2, 55, 85, 115, 55, 75, 30, 40, 55, 87, 117, 147, 70, 110, 50, 40, 70, 66, 85, 125, 120, 74, 124, 85, 125, 110, 83, 123, 55, 65, 97, 109, 65, 85, 105, 85, 60, 90, 129, 90, 115, 100, 115, 105, 120, 150, 125, 145, 130, 170, 120, 72, 72, 77, 128, 120, 61, 78, 107, 45, 59, 69, 56, 63, 95, 36, 56, 50, 73, 81, 35, 22, 52, 50, 68, 38, 45, 65, 65, 100, 82, 124, 80, 48, 48, 48, 80, 110, 150, 50, 52, 72, 48, 80, 54, 92, 52, 105, 60, 75, 53, 73, 38, 55, 89, 121, 59, 77, 65, 92, 58, 50, 50, 75, 100, 80, 70, 110, 66, 66, 66, 66, 90, 85, 95, 100, 69, 117, 30, 70, 131, 131, 100, 100, 160, 110, 160, 110 )
v3 <- c(49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55, 50, 30, 50, 40, 40, 40, 55, 75, 80, 35, 60, 30, 65, 44, 69, 40, 55, 85, 110, 52, 67, 87, 40, 57, 77, 48, 73, 40, 75, 20, 45, 35, 70, 55, 70, 85, 55, 80, 50, 60, 25, 50, 35, 60, 48, 78, 35, 60, 45, 80, 40, 65, 95, 15, 30, 45, 65, 50, 70, 80, 35, 50, 65, 35, 65, 100, 115, 130, 55, 70, 65, 110, 180, 70, 95, 55, 45, 70, 55, 80, 50, 75, 100, 180, 30, 45, 60, 80, 160, 45, 70, 90, 115, 50, 70, 80, 85, 95, 110, 53, 79, 75, 95, 120, 95, 120, 5, 115, 80, 100, 70, 95, 60, 65, 55, 85, 65, 80, 35, 57, 57, 100, 120, 95, 55, 79, 109, 80, 48, 50, 60, 60, 60, 70, 100, 125, 90, 105, 65, 85, 65, 100, 85, 90, 45, 65, 95, 90, 100, 70, 100, 65, 80, 100, 43, 58, 78, 64, 80, 100, 34, 64, 30, 50, 30, 50, 40, 70, 80, 38, 58, 15, 28, 15, 65, 85, 45, 70, 40, 55, 85, 105, 95, 50, 80, 115, 75, 40, 50, 70, 55, 30, 55, 45, 45, 85, 60, 110, 42, 80, 60, 48, 58, 65, 90, 140, 70, 105, 200, 230, 50, 75, 75, 100, 140, 230, 75, 115, 55, 50, 75, 40, 120, 40, 80, 85, 35, 75, 45, 70, 140, 30, 50, 90, 95, 60, 120, 90, 62, 35, 35, 95, 15, 37, 37, 105, 10, 75, 85, 115, 50, 70, 110, 150, 130, 90, 100, 35, 45, 65, 75, 40, 60, 70, 80, 50, 70, 90, 110, 35, 70, 41, 61, 35, 55, 50, 55, 70, 30, 50, 70, 50, 40, 60, 30, 60, 30, 100, 25, 35, 65, 65, 32, 62, 60, 80, 60, 80, 100, 90, 45, 45, 23, 43, 63, 30, 60, 40, 135, 45, 65, 75, 125, 85, 125, 100, 140, 180, 230, 55, 75, 85, 40, 60, 80, 40, 50, 55, 55, 45, 53, 83, 20, 40, 70, 35, 45, 40, 70, 100, 140, 35, 65, 60, 45, 50, 80, 40, 60, 60, 90, 110, 60, 60, 65, 85, 43, 73, 65, 85, 55, 105, 77, 97, 50, 100, 20, 79, 70, 70, 35, 65, 75, 90, 130, 83, 70, 60, 60, 48, 50, 80, 80, 50, 70, 90, 85, 105, 105, 130, 55, 60, 100, 80, 130, 80, 100, 130, 150, 200, 100, 150, 90, 120, 80, 100, 90, 90, 140, 160, 90, 100, 100, 50, 20, 160, 90, 64, 85, 105, 44, 52, 71, 53, 68, 88, 30, 50, 70, 40, 60, 41, 51, 34, 49, 79, 35, 65, 40, 60, 118, 168, 45, 85, 105, 95, 50, 42, 102, 70, 35, 55, 45, 70, 48, 68, 66, 34, 44, 44, 84, 
94, 60, 52, 42, 64, 50, 47, 67, 86, 116, 95, 45, 5, 45, 108, 45, 65, 95, 115, 40, 40, 70, 88, 78, 118, 90, 110, 40, 65, 72, 56, 76, 50, 50, 75, 105, 65, 115, 95, 130, 125, 67, 67, 95, 86, 130, 110, 125, 80, 70, 65, 95, 145, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 120, 100, 106, 110, 120, 100, 120, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 85, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 45, 85, 50, 62, 80, 32, 63, 85, 105, 130, 43, 55, 40, 60, 86, 126, 55, 85, 95, 40, 55, 75, 85, 75, 70, 90, 80, 59, 99, 89, 60, 85, 50, 75, 65, 35, 45, 80, 45, 55, 105, 67, 85, 125, 70, 115, 80, 85, 145, 103, 133, 45, 65, 62, 82, 40, 60, 40, 60, 50, 70, 95, 40, 50, 75, 50, 63, 50, 65, 85, 50, 70, 60, 45, 105, 45, 70, 50, 70, 80, 50, 60, 91, 131, 70, 95, 115, 40)
pokemon[,3] <- c( v3, 70, 80, 55, 75, 55, 60, 90, 60, 70, 90, 40, 80, 30, 85, 40, 84, 50, 60, 90, 50, 80, 70, 100, 95, 50, 75, 75, 105, 66, 112, 50, 70, 90, 55, 65, 129, 90, 72, 70, 80, 70, 70, 100, 120, 90, 90, 90, 100, 90, 90, 90, 77, 90, 95, 65, 95, 122, 40, 58, 72, 40, 52, 67, 38, 77, 43, 55, 71, 40, 60, 50, 58, 72, 39, 47, 68, 48, 62, 62, 78, 60, 54, 76, 76, 100, 150, 50, 150, 60, 72, 66, 86, 53, 88, 67, 115, 60, 90, 62, 88, 33, 52, 77, 119, 50, 72, 65, 75, 57, 150, 35, 53, 70, 91, 48, 76, 70, 70, 70, 70, 122, 122, 122, 122, 85, 184, 35, 80, 95, 95, 121, 150, 110, 60, 60, 120 )
v4 <- c(65, 80, 100, 122, 60, 80, 109, 130, 159, 50, 65, 85, 135, 20, 25, 90, 20, 25, 45, 15, 35, 50, 70, 135, 25, 50, 31, 61, 40, 65, 50, 90, 20, 45, 40, 55, 75, 40, 55, 85, 60, 95, 50, 81, 45, 85, 30, 65, 75, 85, 110, 45, 60, 40, 90, 35, 50, 40, 65, 65, 95, 35, 60, 70, 100, 40, 50, 70, 105, 120, 135, 175, 35, 50, 65, 70, 85, 100, 50, 80, 30, 45, 55, 65, 80, 40, 100, 130, 95, 120, 58, 35, 60, 45, 70, 40, 65, 45, 85, 100, 115, 130, 170, 30, 43, 73, 25, 50, 55, 80, 60, 125, 40, 50, 35, 35, 60, 60, 85, 30, 45, 35, 100, 40, 60, 70, 95, 35, 65, 70, 100, 100, 55, 115, 95, 100, 55, 65, 40, 15, 60, 70, 85, 48, 45, 110, 110, 95, 85, 90, 115, 55, 65, 60, 70, 65, 95, 125, 125, 50, 70, 100, 154, 154, 194, 100, 49, 63, 83, 60, 80, 109, 44, 59, 79, 35, 45, 36, 76, 40, 55, 40, 60, 70, 56, 76, 35, 45, 40, 40, 80, 70, 95, 65, 80, 115, 165, 90, 20, 60, 30, 90, 35, 45, 55, 40, 30, 105, 75, 25, 65, 130, 60, 85, 100, 85, 72, 33, 90, 35, 60, 65, 35, 55, 55, 40, 60, 55, 55, 65, 10, 40, 40, 35, 50, 75, 70, 80, 30, 60, 65, 65, 105, 65, 80, 40, 80, 110, 140, 95, 40, 60, 105, 85, 20, 35, 35, 85, 65, 70, 40, 75, 115, 90, 90, 45, 65, 95, 95, 90, 110, 100, 65, 85, 105, 145, 70, 85, 110, 130, 50, 60, 85, 95, 30, 60, 30, 50, 20, 25, 100, 25, 50, 40, 60, 90, 30, 60, 90, 30, 50, 55, 85, 45, 65, 125, 165, 50, 80, 40, 60, 35, 55, 95, 30, 50, 30, 51, 71, 91, 20, 40, 20, 45, 35, 55, 65, 85, 55, 55, 40, 50, 60, 60, 40, 60, 80, 65, 105, 135, 85, 75, 47, 73, 100, 43, 73, 65, 95, 110, 70, 90, 65, 105, 145, 85, 70, 90, 60, 45, 50, 80, 85, 115, 40, 70, 110, 60, 100, 95, 55, 46, 76, 50, 90, 40, 70, 61, 81, 40, 70, 10, 100, 70, 60, 63, 83, 93, 30, 60, 72, 95, 75, 115, 23, 50, 80, 120, 55, 75, 95, 74, 94, 114, 45, 40, 40, 60, 110, 120, 35, 55, 95, 105, 50, 100, 75, 110, 140, 130, 160, 150, 180, 100, 150, 150, 180, 100, 150, 180, 70, 95, 45, 55, 75, 58, 78, 104, 61, 81, 111, 30, 40, 50, 35, 55, 25, 55, 40, 60, 95, 50, 125, 30, 65, 42, 47, 29, 79, 59, 69, 94, 30, 80, 45, 60, 85, 62, 87, 57, 92, 60, 60, 90, 44, 
54, 54, 105, 105, 42, 64, 65, 41, 71, 24, 79, 10, 70, 15, 92, 92, 40, 50, 80, 120, 40, 35, 115, 140, 38, 68, 30, 60, 61, 86, 90, 49, 69, 60, 62, 92, 132, 45, 130, 80, 55, 110, 95, 125, 120, 116, 60, 130, 45, 70, 135, 65, 65, 75, 65, 80, 95, 105, 105, 105, 105, 105, 75, 105, 125, 150, 150, 130, 80, 100, 120, 75, 80, 100, 135, 100, 120, 120, 100, 45, 60, 75, 45, 70, 100, 63, 83, 108, 35, 60, 25, 35, 45, 50, 88, 53, 98, 53, 98, 53, 98, 67, 107, 36, 50, 65, 50, 80, 25, 50, 60, 55, 77, 30, 50, 60, 80, 25, 40, 55, 50, 65, 85, 30, 30, 40, 50, 70, 30, 40, 55, 37, 77, 70, 110, 80, 35, 45, 65, 15, 30, 140, 106, 35, 65, 35, 45, 103, 55, 95, 53, 83, 74, 112, 40, 60, 80, 120, 40, 65, 55, 75, 95, 105, 125, 125, 44, 87, 65, 80, 110, 40, 60, 75, 40, 60, 55, 85, 65, 85, 40, 57, 97, 24, 54, 45, 70, 70)
pokemon[,4] <- c( v4, 45, 75, 105, 85, 125, 65, 95, 145, 30, 40, 60, 60, 70, 95, 40, 100, 81, 55, 95, 60, 35, 55, 40, 60, 40, 37, 57, 45, 55, 105, 48, 45, 65, 125, 50, 135, 90, 72, 90, 125, 110, 125, 145, 150, 120, 115, 105, 130, 120, 170, 129, 129, 128, 77, 120, 48, 56, 74, 62, 90, 114, 62, 83, 103, 32, 50, 40, 56, 74, 27, 27, 90, 73, 109, 61, 75, 112, 62, 97, 46, 69, 65, 63, 83, 83, 35, 45, 150, 50, 63, 99, 59, 85, 37, 68, 39, 54, 60, 97, 58, 120, 61, 109, 45, 69, 67, 99, 110, 74, 81, 50, 55, 83, 110, 80, 50, 65, 44, 44, 44, 44, 58, 58, 58, 58, 32, 44, 45, 97, 131, 131, 81, 100, 160, 150, 170, 130 )
v5 <- c(65, 80, 100, 120, 50, 65, 85, 85, 115, 64, 80, 105, 115, 20, 25, 80, 20, 25, 80, 80, 35, 50, 70, 80, 35, 70, 31, 61, 54, 79, 50, 80, 30, 55, 40, 55, 85, 40, 55, 75, 65, 90, 65, 100, 25, 50, 40, 75, 65, 75, 90, 55, 80, 55, 75, 45, 70, 40, 65, 50, 80, 45, 70, 50, 80, 40, 50, 90, 55, 70, 95, 95, 35, 60, 85, 30, 45, 70, 100, 120, 30, 45, 65, 65, 80, 40, 80, 80, 55, 70, 62, 35, 60, 70, 95, 50, 100, 25, 45, 35, 55, 75, 95, 45, 90, 115, 25, 50, 55, 80, 45, 65, 50, 80, 110, 110, 75, 45, 70, 30, 45, 105, 40, 80, 100, 25, 45, 50, 80, 55, 85, 120, 80, 95, 85, 85, 70, 90, 70, 20, 100, 130, 95, 48, 65, 95, 95, 110, 75, 55, 70, 45, 70, 75, 95, 110, 125, 90, 85, 50, 70, 100, 90, 100, 120, 100, 65, 80, 100, 50, 65, 85, 48, 63, 83, 45, 55, 56, 96, 80, 110, 40, 60, 80, 56, 76, 35, 55, 20, 65, 105, 45, 70, 45, 60, 90, 110, 100, 50, 80, 65, 100, 55, 65, 95, 55, 30, 85, 45, 25, 65, 95, 130, 42, 110, 85, 48, 58, 65, 35, 60, 65, 65, 65, 95, 40, 60, 55, 80, 100, 230, 95, 105, 75, 50, 75, 40, 80, 30, 60, 85, 35, 75, 45, 140, 70, 50, 80, 90, 95, 40, 60, 95, 65, 45, 35, 110, 65, 55, 55, 70, 135, 100, 75, 115, 50, 70, 100, 120, 154, 154, 100, 55, 65, 85, 85, 50, 60, 70, 80, 50, 70, 90, 110, 30, 60, 41, 61, 30, 25, 50, 25, 90, 50, 70, 100, 30, 40, 60, 30, 50, 30, 70, 35, 55, 115, 135, 52, 82, 60, 60, 35, 55, 65, 30, 50, 30, 23, 43, 73, 30, 60, 40, 90, 35, 55, 65, 115, 55, 95, 40, 50, 60, 80, 55, 75, 85, 40, 60, 80, 75, 85, 75, 75, 80, 53, 83, 20, 40, 65, 35, 45, 45, 75, 105, 70, 80, 110, 60, 45, 50, 80, 40, 60, 75, 105, 105, 60, 60, 85, 65, 41, 71, 35, 55, 70, 120, 87, 107, 50, 80, 55, 125, 70, 120, 33, 63, 83, 90, 130, 87, 80, 60, 60, 48, 50, 80, 80, 50, 70, 90, 55, 75, 75, 65, 65, 30, 50, 80, 90, 60, 80, 90, 110, 100, 200, 150, 130, 150, 110, 120, 140, 160, 90, 90, 90, 100, 100, 50, 20, 160, 90, 55, 65, 85, 44, 52, 71, 56, 76, 101, 30, 40, 60, 40, 60, 41, 51, 34, 49, 79, 70, 105, 30, 50, 88, 138, 45, 105, 85, 95, 50, 42, 102, 90, 30, 50, 53, 78, 62, 82, 66, 44, 54, 56, 96, 96, 105, 
52, 37, 59, 50, 41, 61, 86, 116, 45, 90, 65, 42, 108, 45, 55, 85, 95, 85, 40, 70, 70, 42, 72, 55, 75, 40, 65, 72, 61, 86, 120, 60, 85, 105, 85, 90, 95, 55, 50, 85, 95, 115, 56, 65, 95, 75, 60, 75, 115, 115, 150, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 100, 120, 106, 110, 120, 100, 130, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 70, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 55, 95, 30, 42, 55, 32, 63, 25, 40, 80, 43, 55, 45, 65, 86, 126, 35, 50, 65, 40, 55, 75, 85, 75, 60, 80, 80, 39, 79, 69, 50, 75, 50, 75, 55, 35, 45, 70, 45, 55, 105, 67, 35, 75, 70, 115, 80, 65, 105, 45, 65, 45, 65, 62, 82, 40, 60, 40, 60, 65, 85, 110, 50, 60, 85, 50, 63, 60, 75, 95, 50, 70, 60, 45, 105, 55, 80, 85, 105, 45, 50, 60, 86, 116, 60, 85, 85, 40)
pokemon[,5] <- c( v5, 70, 80, 55, 95, 55, 60, 90, 40, 50, 70, 40, 80, 135, 65, 60, 99, 50, 60, 90, 50, 80, 40, 70, 95, 50, 75, 65, 95, 66, 48, 50, 70, 90, 55, 105, 72, 90, 129, 80, 90, 80, 80, 120, 100, 80, 80, 90, 90, 100, 90, 90, 128, 77, 95, 45, 58, 75, 60, 70, 100, 44, 56, 71, 36, 77, 38, 52, 69, 25, 30, 50, 54, 66, 79, 98, 154, 57, 81, 48, 71, 90, 60, 81, 81, 37, 49, 50, 150, 65, 89, 57, 75, 46, 75, 56, 86, 60, 123, 63, 89, 43, 94, 45, 59, 63, 92, 130, 63, 67, 150, 75, 113, 150, 87, 60, 82, 55, 55, 55, 55, 75, 75, 75, 75, 35, 46, 40, 80, 98, 98, 95, 150, 110, 130, 130, 90 )
v6 <- c(45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30, 70, 50, 35, 75, 145, 56, 71, 101, 121, 72, 97, 70, 100, 55, 80, 90, 110, 40, 65, 41, 56, 76, 50, 65, 85, 35, 60, 65, 100, 20, 45, 55, 90, 30, 40, 50, 25, 30, 45, 90, 95, 120, 90, 115, 55, 85, 70, 95, 60, 95, 90, 90, 70, 90, 105, 120, 150, 35, 45, 55, 40, 55, 70, 70, 100, 20, 35, 45, 90, 105, 15, 30, 30, 45, 70, 60, 75, 100, 45, 70, 25, 50, 40, 70, 80, 95, 110, 130, 70, 42, 67, 50, 75, 100, 140, 40, 55, 35, 45, 87, 76, 30, 35, 60, 25, 40, 50, 60, 90, 100, 60, 85, 63, 68, 85, 115, 90, 105, 95, 105, 93, 85, 105, 110, 80, 81, 81, 60, 48, 55, 65, 130, 65, 40, 35, 55, 55, 80, 130, 150, 30, 85, 100, 90, 50, 70, 80, 130, 130, 140, 100, 45, 60, 80, 65, 80, 100, 43, 58, 78, 20, 90, 50, 70, 55, 85, 30, 40, 130, 67, 67, 60, 15, 15, 20, 40, 70, 95, 35, 45, 55, 45, 50, 40, 50, 30, 70, 50, 80, 110, 85, 30, 30, 95, 15, 35, 110, 65, 91, 30, 85, 48, 33, 85, 15, 40, 45, 85, 30, 30, 30, 45, 85, 65, 75, 5, 85, 75, 115, 40, 55, 20, 30, 50, 50, 35, 65, 45, 75, 70, 70, 65, 95, 115, 85, 40, 50, 60, 85, 75, 35, 70, 65, 95, 83, 100, 55, 115, 100, 85, 41, 51, 61, 71, 110, 90, 100, 70, 95, 120, 145, 45, 55, 80, 100, 40, 50, 60, 70, 35, 70, 60, 100, 20, 15, 65, 15, 65, 30, 50, 70, 30, 60, 80, 85, 125, 85, 65, 40, 50, 80, 100, 65, 60, 35, 70, 30, 90, 100, 40, 160, 40, 28, 48, 68, 25, 50, 20, 30, 50, 70, 50, 20, 50, 50, 30, 40, 50, 50, 60, 80, 100, 65, 105, 135, 95, 95, 85, 85, 65, 40, 55, 65, 95, 105, 60, 60, 35, 40, 20, 20, 60, 80, 60, 10, 70, 100, 35, 55, 50, 80, 80, 90, 65, 70, 70, 60, 60, 35, 55, 55, 75, 23, 43, 75, 45, 80, 81, 70, 40, 45, 65, 75, 25, 25, 51, 65, 75, 115, 23, 50, 80, 100, 25, 45, 65, 32, 52, 52, 55, 97, 50, 50, 100, 120, 30, 50, 70, 110, 50, 50, 50, 110, 110, 110, 110, 90, 90, 90, 90, 95, 115, 100, 150, 150, 90, 180, 31, 36, 56, 61, 81, 108, 40, 50, 60, 60, 80, 100, 31, 71, 25, 65, 45, 60, 70, 55, 90, 58, 58, 30, 30, 36, 36, 36, 36, 66, 70, 40, 95, 85, 115, 35, 85, 34, 39, 115, 70, 80, 85, 105, 135, 105, 
71, 85, 112, 45, 74, 84, 23, 33, 10, 60, 30, 91, 35, 42, 82, 102, 92, 5, 60, 90, 112, 32, 47, 65, 95, 50, 85, 46, 66, 91, 50, 40, 60, 30, 125, 60, 50, 40, 50, 95, 83, 80, 95, 95, 65, 95, 80, 90, 80, 110, 40, 45, 110, 91, 86, 86, 86, 86, 86, 95, 80, 115, 90, 100, 77, 100, 90, 90, 85, 80, 100, 125, 100, 127, 120, 100, 63, 83, 113, 45, 55, 65, 45, 60, 70, 42, 77, 55, 60, 80, 66, 106, 64, 101, 64, 101, 64, 101, 24, 29, 43, 65, 93, 76, 116, 15, 20, 25, 72, 114, 68, 88, 50, 50, 35, 40, 45, 64, 69, 74, 45, 85, 42, 42, 92, 57, 47, 112, 66, 116, 30, 90, 98, 65, 74, 92, 50, 95, 55, 60, 55, 45, 48, 58, 97, 30, 30, 22, 32, 70, 110, 65, 75, 65, 105, 75, 115, 45, 55, 65, 20, 30, 30, 55, 98, 44, 59, 79, 75, 95, 103, 60, 20, 15, 30, 40, 60, 65, 65, 108, 10, 20, 30, 50, 90, 60)
pokemon[,6] <- c( v6, 40, 50, 30, 40, 20, 55, 80, 57, 67, 97, 40, 50, 105, 25, 145, 32, 65, 105, 48, 35, 55, 60, 70, 55, 60, 80, 60, 80, 65, 109, 38, 58, 98, 60, 100, 108, 108, 108, 111, 121, 111, 101, 90, 90, 101, 91, 95, 95, 95, 108, 108, 90, 128, 99, 38, 57, 64, 60, 73, 104, 71, 97, 122, 57, 78, 62, 84, 126, 35, 29, 89, 72, 106, 42, 52, 75, 52, 68, 43, 58, 102, 68, 104, 104, 28, 35, 60, 60, 23, 29, 49, 72, 45, 73, 50, 68, 30, 44, 44, 59, 70, 109, 48, 71, 46, 58, 60, 118, 101, 50, 40, 60, 80, 75, 38, 56, 51, 56, 46, 41, 84, 99, 69, 54, 28, 28, 55, 123, 99, 99, 95, 50, 110, 70, 80, 70 )
# Attach the six base-stat labels to the columns of the pokemon matrix
stat_names <- c("HitPoints", "Attack", "Defense",
                "SpecialAttack", "SpecialDefense", "Speed")
colnames(pokemon) <- stat_names
# Confirm: 800 x 6 numeric matrix with column dimnames
str(pokemon)
## num [1:800, 1:6] 45 60 80 80 39 58 78 78 78 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "HitPoints" "Attack" "Defense" "SpecialAttack" ...
# Per-column means of the six stats -- colMeans() is the idiomatic,
# faster equivalent of apply(pokemon, 2, FUN = mean) and prints the
# same named numeric vector
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# Choose k for k-means via the "elbow" of a scree plot.
# NOTE(review): kmeans() uses the RNG; call set.seed() upstream for
# reproducible cluster assignments.
# Preallocate the total within-cluster sum-of-squares vector instead
# of growing it element-by-element inside the loop: wss
wss <- numeric(15)
# Loop over 1 to 15 possible clusters
for (i in seq_len(15)) {
  # Fit the model: km.out
  # nstart = 20 restarts guards against poor local optima;
  # iter.max = 50 gives the algorithm room to converge
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Produce a scree plot; look for the bend ("elbow") in the curve
plot(seq_len(15), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# Select number of clusters (elbow suggests 3)
k <- 3
# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)
# View the resulting model
km.out
## K-means clustering with 3 clusters of sizes 270, 175, 355
##
## Cluster means:
## HitPoints Attack Defense SpecialAttack SpecialDefense Speed
## 1 81.90370 96.15926 77.65556 104.12222 86.87778 94.71111
## 2 79.30857 97.29714 108.93143 66.71429 87.04571 57.29143
## 3 54.68732 56.93239 53.64507 52.02254 53.04789 53.58873
##
## Clustering vector:
## [1] 3 3 1 1 3 3 1 1 1 3 3 2 1 3 3 3 3 3 3 1 3 3 1 1 3 3 3 1 3 1 3 1 3 2 3
## [36] 3 2 3 3 1 3 1 3 1 3 3 3 1 3 3 1 3 2 3 1 3 3 3 1 3 1 3 1 3 1 3 3 2 3 1
## [71] 1 1 3 2 2 3 3 1 3 1 3 2 2 3 1 3 2 2 3 1 3 3 1 3 2 3 2 3 2 3 1 1 1 2 3
## [106] 2 3 2 3 1 3 1 3 2 2 2 3 3 2 3 2 3 2 2 2 3 1 3 2 3 1 1 1 1 1 1 2 2 2 3
## [141] 2 2 2 3 3 1 1 1 3 3 2 3 2 1 1 2 1 1 1 3 3 1 1 1 1 1 3 3 2 3 3 1 3 3 2
## [176] 3 3 3 1 3 3 3 3 1 3 1 3 3 3 3 3 3 1 3 3 1 1 2 3 3 2 1 3 3 1 3 3 3 3 3
## [211] 2 1 2 3 2 1 3 3 1 3 2 3 2 2 2 3 2 3 2 2 2 2 2 3 3 2 3 2 3 2 3 3 1 3 1
## [246] 2 3 1 1 1 3 2 1 1 3 3 2 3 3 3 2 1 1 1 2 3 3 2 2 1 1 1 3 3 1 1 3 3 1 1
## [281] 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 3 3 3 2 3 3 1 1 3 3 3 2 3 3 1 3
## [316] 1 3 3 3 1 3 2 3 2 3 3 3 2 3 2 3 2 2 2 3 3 1 3 1 1 3 3 3 3 3 3 2 3 1 1
## [351] 3 1 3 1 2 2 3 1 3 3 3 1 3 1 3 2 1 1 1 1 2 3 2 3 2 3 2 3 2 3 2 3 1 3 2
## [386] 3 1 1 3 2 2 3 1 1 3 3 1 1 3 3 1 3 2 2 2 3 3 2 1 1 3 2 2 1 2 2 2 1 1 1
## [421] 1 1 1 2 1 1 1 1 1 1 2 1 3 2 2 3 3 1 3 3 1 3 3 1 3 3 3 3 3 3 1 3 1 3 2
## [456] 3 2 3 2 2 2 1 3 2 3 3 1 3 1 3 2 1 3 1 3 1 1 1 1 3 1 3 3 1 3 2 3 3 3 3
## [491] 2 3 3 1 1 3 3 1 1 3 2 3 2 3 1 2 3 1 3 3 1 2 1 1 2 2 2 1 1 1 1 2 1 2 1
## [526] 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 3 3 1 3 3 1
## [561] 3 3 1 3 3 3 3 2 3 1 3 1 3 1 3 1 3 2 3 3 1 3 1 3 2 2 3 1 3 1 2 2 3 2 2
## [596] 3 3 1 2 2 3 3 1 3 3 1 3 1 3 1 1 3 3 1 3 2 1 1 3 2 3 2 1 3 2 3 2 3 1 3
## [631] 2 3 1 3 1 3 3 2 3 3 1 3 1 3 3 1 3 1 1 3 2 3 2 3 1 2 3 1 3 2 3 2 2 3 3
## [666] 1 3 1 3 3 1 3 2 2 3 2 1 3 1 2 3 1 2 3 2 3 2 2 3 2 3 2 1 2 3 3 1 3 1 1
## [701] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 2 3 3 1 3 3 1 3 3 3 3 1 3 3 3
## [736] 3 1 3 3 1 3 1 3 2 1 3 1 1 3 2 1 2 3 2 3 1 3 2 3 2 3 2 3 1 3 1 3 2 3 1
## [771] 1 1 1 2 3 1 1 2 3 2 3 3 3 3 2 2 2 2 3 2 3 1 1 1 2 2 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 1018348.0 709020.5 812079.9
## (between_SS / total_SS = 40.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter Defense against Speed, one colour per k-means cluster
plot(
  pokemon[, c("Defense", "Speed")],
  col = km.out$cluster,
  xlab = "Defense",
  ylab = "Speed",
  main = sprintf("k-means clustering of Pokemon with %d clusters", k)
)
Chapter 2 - Hierarchical Clustering
Introduction to hierarchical clustering - creating clusters when the number of clusters is not known ahead of time:
Selecting the number of clusters - dendrograms (trees):
Clustering linkage and practical matters - how to determine distances between clusters:
Example code includes:
# Simulated data for hierarchical clustering: 50 points in 2-D
x <- matrix(data=NA, nrow=50, ncol=2)
# First coordinate of each point
x[, 1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72,
1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, -5.43, -5.26,
-6.76, -4.54, -5.64, -4.54, -4.3, -3.96, -5.61, -4.5, -1.72, -0.78, -0.85, -2.41,
0.04, 0.21, -0.36, 0.76, -0.73, -1.37, 0.43, -0.81, 1.44, -0.43, 0.66
)
# Second coordinate of each point
x[, 2] <- c( 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4,
1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46,
1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, -1.39, -2.22, -2.18, -1.07, -1.18, -0.61,
-2.48, -1.35, -0.61, -3.11, -2.86, -3.13, -3.46, -1.92, -1.35
)
# Confirm: 50 x 2 numeric matrix
str(x)
## num [1:50, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Create hierarchical clustering model from pairwise distances
# (dist() defaults to Euclidean): hclust.out
hclust.out <- hclust(d=dist(x))
# Inspect the result -- summary() lists the components of the hclust
# object (merge history, heights, ordering) rather than plotting it
summary(hclust.out)
## Length Class Mode
## merge 98 -none- numeric
## height 49 -none- numeric
## order 50 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
# Cut by height: sever all merges above h = 7; here that yields three
# cluster labels (1-3)
cutree(hclust.out, h=7)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Cut by number of clusters -- for this tree, k = 3 produces the same
# grouping as cutting at height 7
cutree(hclust.out, k=3)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Cluster using complete linkage (inter-cluster distance = maximum
# pairwise distance): hclust.complete
hclust.complete <- hclust(dist(x), method="complete")
# Cluster using average linkage (mean pairwise distance): hclust.average
hclust.average <- hclust(dist(x), method="average")
# Cluster using single linkage (minimum pairwise distance; prone to
# "chaining"): hclust.single
hclust.single <- hclust(dist(x), method="single")
# Plot dendrogram of hclust.complete
plot(hclust.complete)
# Plot dendrogram of hclust.average
plot(hclust.average)
# Plot dendrogram of hclust.single
plot(hclust.single)
# View column means of the 800 x 6 stats matrix
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# View column standard deviations (base R has no colSds, so apply
# over margin 2 = columns)
apply(pokemon, 2, FUN=sd)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 25.53467 32.45737 31.18350 32.72229 27.82892
## Speed
## 29.06047
# Scale each column to mean 0 / SD 1 (the SDs above range ~26-33, so
# scaling keeps any one stat from dominating the distance metric)
pokemon.scaled <- scale(pokemon)
# Create hierarchical clustering model (complete linkage on Euclidean
# distances): hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method="complete")
Chapter 3 - Dimensionality Reduction with PCA
Introduction to PCA - a popular type of dimensionality reduction to find structure in features, and aid in visualization:
Visualizing and interpreting PCA results:
Practical issues with PCA - scaling, missing values (drop and/or impute), categorical data (drop or encode as numbers):
Example code includes:
pokemon <- matrix(nrow=50, ncol=4, byrow=FALSE,
data=c( 58, 90, 70, 60, 60, 44, 100, 80, 80, 60, 150, 62, 75, 70, 115, 74, 74,
40, 95, 80, 25, 51, 48, 45, 35, 20, 60, 70, 70, 80, 57, 64, 75, 101, 50,
60, 85, 95, 58, 100, 95, 91, 62, 70, 60, 70, 50, 50, 70, 150, 64, 100,
94, 80, 55, 38, 77, 145, 100, 55, 100, 77, 98, 130, 45, 108, 94, 35, 65,
120, 35, 65, 72, 45, 55, 40, 70, 20, 55, 100, 24, 78, 98, 72, 75, 100, 120,
155, 89, 150, 125, 90, 48, 40, 110, 85, 85, 50, 110, 120, 58, 70, 50, 110,
90, 33, 77, 150, 70, 145, 120, 62, 63, 100, 20, 133, 131, 30, 65, 130, 70,
65, 48, 55, 40, 90, 50, 50, 65, 80, 86, 52, 63, 72, 70, 89, 70, 109, 77,
120, 79, 129, 54, 50, 70, 140, 40, 62, 70, 100, 80, 80, 66, 45, 80, 70,
90, 110, 95, 40, 90, 65, 101, 65, 20, 32, 20, 105, 60, 45, 45, 59, 48, 63,
60, 25, 65, 40, 70, 100, 23, 81, 101, 29, 48, 112, 100, 81, 48, 90, 81,
108, 68, 25, 100, 20, 35, 65, 90, 90
)
)
colnames(pokemon) <- c( "HitPoint", "Attack", "Defense", "Speed" )
rownames(pokemon) <- c( 'Quilava', 'Goodra', 'Mothim', 'Marowak', 'Chandelure', 'Helioptile',
'MeloettaAria Forme', 'MetagrossMega Metagross', 'Sawsbuck', 'Probopass',
'GiratinaAltered Forme', 'Tranquill', 'Simisage', 'Scizor', 'Jigglypuff',
'Carracosta', 'Ferrothorn', 'Kadabra', 'Sylveon', 'Golem', 'Magnemite',
'Vanillish', 'Unown', 'Snivy', 'Tynamo', 'Duskull', 'Beautifly', 'Marill',
'Lunatone', 'Flygon', 'Bronzor', 'Monferno', 'Simisear', 'Aromatisse',
'Scraggy', 'Scolipede', 'Staraptor', 'GyaradosMega Gyarados', 'Tyrunt', 'Zekrom',
'Gyarados', 'Cobalion', 'Espurr', 'Spheal', 'Dodrio', 'Torkoal', 'Cacnea',
'Trubbish', 'Lucario', 'GiratinaOrigin Forme'
)
# Confirm structure: 50 x 4 numeric matrix with Pokemon row names
str(pokemon)
## num [1:50, 1:4] 58 90 70 60 60 44 100 80 80 60 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:4] "HitPoint" "Attack" "Defense" "Speed"
# Mean of each of the four stats across the 50 sampled Pokemon
colMeans(pokemon)
## HitPoint Attack Defense Speed
## 71.08 81.22 78.44 66.58
# Peek at the first six rows
head(pokemon)
## HitPoint Attack Defense Speed
## Quilava 58 64 58 80
## Goodra 90 100 70 80
## Mothim 70 94 50 66
## Marowak 60 80 110 45
## Chandelure 60 55 90 80
## Helioptile 44 38 33 70
# Perform scaled PCA: pr.out
# Use prcomp's exact formal name `scale.` rather than relying on
# partial argument matching of `scale`
pr.out <- prcomp(pokemon, scale. = TRUE)
# Inspect model output
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4420 1.0013 0.7941 0.53595
## Proportion of Variance 0.5199 0.2507 0.1577 0.07181
## Cumulative Proportion 0.5199 0.7705 0.9282 1.00000
# Biplot: observations as points, variable loadings as arrows
biplot(pr.out)
# Variability of each principal component: pr.var
# (the squared standard deviations are the eigenvalues)
pr.var <- (pr.out$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
# FIX: corrected "Cummulative" -> "Cumulative" in the axis label
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")
pokeTotal <- matrix(ncol=1, nrow=50,
data=c( 405, 600, 424, 425, 520, 289, 600, 700, 475, 525, 680, 358, 498, 500,
270, 495, 489, 400, 525, 495, 325, 395, 336, 308, 275, 295, 395, 250,
440, 520, 300, 405, 498, 462, 348, 485, 485, 640, 362, 680, 540, 580,
355, 290, 460, 470, 335, 329, 525, 680
)
)
# Prepend the Total column; pokemon becomes a 50 x 5 matrix
pokemon <- cbind(pokeTotal, pokemon)
# cbind named the new column after the source object, so relabel it
colnames(pokemon)[1] <- "Total"
# Confirm the bind: 50 x 5 numeric matrix including Total
str(pokemon)
## num [1:50, 1:5] 405 600 424 425 520 289 600 700 475 525 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:5] "Total" "HitPoint" "Attack" "Defense" ...
# Column means -- note Total's scale (~449) dwarfs the other stats
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Mean of each variable (same call repeated in the course material)
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Standard deviation of each variable -- Total's SD (~119) is what
# motivates scaling before PCA below
apply(pokemon, 2, sd)
## Total HitPoint Attack Defense Speed
## 119.32321 25.62193 33.03078 32.05809 27.51036
# PCA model with scaling: pr.with.scaling
# Use prcomp's exact formal name `scale.` rather than relying on
# partial argument matching of `scale`
pr.with.scaling <- prcomp(pokemon, scale. = TRUE)
# PCA model without scaling (prcomp's default): pr.without.scaling
pr.without.scaling <- prcomp(pokemon, scale. = FALSE)
# Create biplots of both for comparison -- the unscaled fit is
# dominated by the high-variance Total column
biplot(pr.with.scaling)
biplot(pr.without.scaling)
Chapter 4 - Case Study
Introduction to the case study:
PCA Review and Next Steps:
Example code includes:
# Cached to avoid repeated downloads: the original comment claimed
# caching but read straight from the URL every time; download once
# per session into tempdir() and read from the local copy.
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
local_csv <- file.path(tempdir(), "WisconsinCancer.csv")
if (!file.exists(local_csv)) {
  download.file(url, destfile = local_csv, quiet = TRUE)
}
# Read the data: wisc.df
wisc.df <- read.csv(local_csv, stringsAsFactors=FALSE)
# Convert the feature columns (3-32) of the data to a matrix: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])
# Set the row names of wisc.data to the patient id
row.names(wisc.data) <- wisc.df$id
# Create diagnosis vector as a 0/1 indicator: 1 = malignant ("M"),
# 0 = benign
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
And, continuing with:
# Check column means and standard deviations -- the magnitudes below
# span several orders, which is why the PCA/clustering steps scale
# the data first
colMeans(wisc.data)
## radius_mean texture_mean perimeter_mean
## 1.412729e+01 1.928965e+01 9.196903e+01
## area_mean smoothness_mean compactness_mean
## 6.548891e+02 9.636028e-02 1.043410e-01
## concavity_mean concave.points_mean symmetry_mean
## 8.879932e-02 4.891915e-02 1.811619e-01
## fractal_dimension_mean radius_se texture_se
## 6.279761e-02 4.051721e-01 1.216853e+00
## perimeter_se area_se smoothness_se
## 2.866059e+00 4.033708e+01 7.040979e-03
## compactness_se concavity_se concave.points_se
## 2.547814e-02 3.189372e-02 1.179614e-02
## symmetry_se fractal_dimension_se radius_worst
## 2.054230e-02 3.794904e-03 1.626919e+01
## texture_worst perimeter_worst area_worst
## 2.567722e+01 1.072612e+02 8.805831e+02
## smoothness_worst compactness_worst concavity_worst
## 1.323686e-01 2.542650e-01 2.721885e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1.146062e-01 2.900756e-01 8.394582e-02
# Column standard deviations (apply over margin 2 = columns)
apply(wisc.data, 2, FUN=sd)
## radius_mean texture_mean perimeter_mean
## 3.524049e+00 4.301036e+00 2.429898e+01
## area_mean smoothness_mean compactness_mean
## 3.519141e+02 1.406413e-02 5.281276e-02
## concavity_mean concave.points_mean symmetry_mean
## 7.971981e-02 3.880284e-02 2.741428e-02
## fractal_dimension_mean radius_se texture_se
## 7.060363e-03 2.773127e-01 5.516484e-01
## perimeter_se area_se smoothness_se
## 2.021855e+00 4.549101e+01 3.002518e-03
## compactness_se concavity_se concave.points_se
## 1.790818e-02 3.018606e-02 6.170285e-03
## symmetry_se fractal_dimension_se radius_worst
## 8.266372e-03 2.646071e-03 4.833242e+00
## texture_worst perimeter_worst area_worst
## 6.146258e+00 3.360254e+01 5.693570e+02
## smoothness_worst compactness_worst concavity_worst
## 2.283243e-02 1.573365e-01 2.086243e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 6.573234e-02 6.186747e-02 1.806127e-02
# Execute PCA with scaling (the means/SDs above differ by orders of
# magnitude, so scaling is appropriate): wisc.pr
# Use prcomp's exact formal name `scale.` rather than relying on
# partial argument matching of `scale`
wisc.pr <- prcomp(wisc.data, scale. = TRUE)
# Look at summary of results
summary(wisc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Create a biplot of wisc.pr
biplot(wisc.pr)
# Scatter plot observations by components 1 and 2, coloured by
# diagnosis + 1 (benign -> colour 1, malignant -> colour 2)
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC2")
# Repeat for components 1 and 3
plot(wisc.pr$x[, c(1, 3)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC3")
# Show the two variance plots side by side
par(mfrow = c(1, 2))
# Calculate variability of each component (squared standard
# deviations are the eigenvalues)
pr.var <- (wisc.pr$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
# FIX: corrected "Cummulative" -> "Cumulative" in the axis label
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")
# Restore single-panel plotting
par(mfrow = c(1, 1))
# Scale the wisc.data data (features are on very different scales,
# see the means/SDs above): data.scaled
data.scaled <- scale(wisc.data)
# Calculate the (Euclidean) distances: data.dist
data.dist <- dist(data.scaled)
# Create a hierarchical clustering model with complete linkage:
# wisc.hclust
wisc.hclust <- hclust(data.dist, method="complete")
# Cut tree so that it has 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(wisc.hclust, k=4)
# Compare cluster membership to actual diagnoses
# (0 = benign, 1 = malignant)
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
# Create a k-means model on the scaled wisc.data: wisc.km
# NOTE(review): kmeans() is stochastic; call set.seed() beforehand
# for reproducible cluster labels
wisc.km <- kmeans(scale(wisc.data), centers=2, nstart=20)
# Compare k-means to actual diagnoses (0 = benign, 1 = malignant)
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 343 37
## 2 14 175
# Compare k-means to hierarchical clustering (cluster labels are
# arbitrary; look for rows/columns where counts concentrate)
table(wisc.km$cluster, wisc.hclust.clusters)
## wisc.hclust.clusters
## 1 2 3 4
## 1 17 0 363 0
## 2 160 7 20 2
# Create a hierarchical clustering model on the first 7 PCs (~91% of
# total variance per the PCA summary above): wisc.pr.hclust
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")
# Cut model into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k=4)
# Compare to actual diagnoses (0 = benign, 1 = malignant)
table(wisc.pr.hclust.clusters, diagnosis)
## diagnosis
## wisc.pr.hclust.clusters 0 1
## 1 5 113
## 2 350 97
## 3 2 0
## 4 0 2
# Compare to k-means and hierarchical (same tables as earlier,
# repeated here for side-by-side reading)
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 343 37
## 2 14 175
# Hierarchical clusters vs diagnosis, as above
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
Chapter 1 - Regression Models: Fitting and Training
Max Kuhn, author of the caret package for supervised learning:
Out-of-sample error measurement - Zach Mayer, co-author of the caret package:
Cross-validation - improved approach of taking multiple test/train and averaging out-of-sample error rates:
Example code includes:
# Load ggplot2's diamonds data: 53,940 rows, 10 variables
data(diamonds, package="ggplot2")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Fit lm model: model (price regressed on all other variables)
model <- lm(price ~ ., data=diamonds)
# Predict on full data: p -- in-sample predictions, so the RMSE
# below is an optimistic estimate of performance on new data
p <- predict(model)
# Compute errors: error
error <- p - diamonds$price
# Calculate RMSE
sqrt(mean(error^2))
## [1] 1129.843
# Hold-out validation: shuffle, then split 80/20 into train/test.
# NOTE(review): no set.seed() before sample(), so the split (and the
# RMSE below) is not reproducible across runs.
# Shuffle row indices: rows
rows <- sample(nrow(diamonds), replace=FALSE)
# Randomly order data
diamonds <- diamonds[rows, ]
# Determine row to split on: split (80% of rows go to training)
split <- round(nrow(diamonds) * 0.8)
# Create train
train <- diamonds[1:split, ]
# Create test (everything after the split point)
test <- diamonds[-(1:split), ]
# Fit lm model on train: model
model <- lm(price ~ ., data=train)
# Predict on test: p
p <- predict(model, newdata=test)
# Compute errors: error (out-of-sample this time)
error <- p - test$price
# Calculate RMSE -- slightly higher than the in-sample 1129.843
sqrt(mean(error^2))
## [1] 1151.858
# Fit lm model using 10-fold CV: model
# caret::train fits on 9 folds and scores on the held-out fold, then
# aggregates; verboseIter = TRUE prints the per-fold log shown below
model <- caret::train(
price ~ ., data=diamonds,
method = "lm",
trControl = caret::trainControl(
method = "cv", number = 10,
verboseIter = TRUE
)
)
## Warning: package 'caret' was built under R version 3.2.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console -- CV RMSE (~1130.8) is close to the
# in-sample RMSE above because n is large
model
## Linear Regression
##
## 53940 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 48547, 48547, 48545, 48545, 48545, 48547, ...
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 1130.811 0.9196775 32.60512 0.004130502
##
##
# Load the Boston housing data from mlbench and keep the
# conventional short name
data(BostonHousing, package="mlbench")
Boston <- BostonHousing
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Fit lm model using 5-fold CV: model
# (fewer folds than before -- Boston has only 506 rows)
model <- caret::train(
medv ~ ., data=Boston,
method = "lm",
trControl = caret::trainControl(
method = "cv", number = 5,
verboseIter = TRUE
)
)
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console (5-fold CV RMSE ~= 4.99)
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 404, 404, 406, 405
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 4.994366 0.7118604 0.8566057 0.0944254
##
##
# Fit lm model using 5 x 5-fold repeated CV: model
# FIX: `repeats` is only honoured when method = "repeatedcv"; with
# method = "cv" it is ignored (newer caret versions warn), so the
# original call actually ran a single 5-fold CV despite the "5 x 5"
# intent. The fold log below was produced by that original run.
model <- train(
  medv ~ ., Boston,
  method = "lm",
  trControl = trainControl(
    method = "repeatedcv", number = 5,
    repeats = 5, verboseIter = TRUE
  )
)
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console -- note the smaller RMSE SD (0.57 vs 0.86)
# compared with the single 5-fold run above
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 404, 404, 406, 405
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 4.894217 0.7161769 0.5744566 0.09604537
##
##
# Predict on full Boston dataset using the final model (caret refits
# it on the full training set after resampling, per the log above)
predict(model, newdata=Boston)
## 1 2 3 4 5 6
## 30.0038434 25.0255624 30.5675967 28.6070365 27.9435242 25.2562845
## 7 8 9 10 11 12
## 23.0018083 19.5359884 11.5236369 18.9202621 18.9994965 21.5867957
## 13 14 15 16 17 18
## 20.9065215 19.5529028 19.2834821 19.2974832 20.5275098 16.9114013
## 19 20 21 22 23 24
## 16.1780111 18.4061360 12.5238575 17.6710367 15.8328813 13.8062853
## 25 26 27 28 29 30
## 15.6783383 13.3866856 15.4639765 14.7084743 19.5473729 20.8764282
## 31 32 33 34 35 36
## 11.4551176 18.0592329 8.8110574 14.2827581 13.7067589 23.8146353
## 37 38 39 40 41 42
## 22.3419371 23.1089114 22.9150261 31.3576257 34.2151023 28.0205641
## 43 44 45 46 47 48
## 25.2038663 24.6097927 22.9414918 22.0966982 20.4232003 18.0365509
## 49 50 51 52 53 54
## 9.1065538 17.2060775 21.2815254 23.9722228 27.6558508 24.0490181
## 55 56 57 58 59 60
## 15.3618477 31.1526495 24.8568698 33.1091981 21.7753799 21.0849356
## 61 62 63 64 65 66
## 17.8725804 18.5111021 23.9874286 22.5540887 23.3730864 30.3614836
## 67 68 69 70 71 72
## 25.5305651 21.1133856 17.4215379 20.7848363 25.2014886 21.7426577
## 73 74 75 76 77 78
## 24.5574496 24.0429571 25.5049972 23.9669302 22.9454540 23.3569982
## 79 80 81 82 83 84
## 21.2619827 22.4281737 28.4057697 26.9948609 26.0357630 25.0587348
## 85 86 87 88 89 90
## 24.7845667 27.7904920 22.1685342 25.8927642 30.6746183 30.8311062
## 91 92 93 94 95 96
## 27.1190194 27.4126673 28.9412276 29.0810555 27.0397736 28.6245995
## 97 98 99 100 101 102
## 24.7274498 35.7815952 35.1145459 32.2510280 24.5802202 25.5941347
## 103 104 105 106 107 108
## 19.7901368 20.3116713 21.4348259 18.5399401 17.1875599 20.7504903
## 109 110 111 112 113 114
## 22.6482911 19.7720367 20.6496586 26.5258674 20.7732364 20.7154831
## 115 116 117 118 119 120
## 25.1720888 20.4302559 23.3772463 23.6904326 20.3357836 20.7918087
## 121 122 123 124 125 126
## 21.9163207 22.4710778 20.5573856 16.3666198 20.5609982 22.4817845
## 127 128 129 130 131 132
## 14.6170663 15.1787668 18.9386859 14.0557329 20.0352740 19.4101340
## 133 134 135 136 137 138
## 20.0619157 15.7580767 13.2564524 17.2627773 15.8784188 19.3616395
## 139 140 141 142 143 144
## 13.8148390 16.4488147 13.5714193 3.9888551 14.5949548 12.1488148
## 145 146 147 148 149 150
## 8.7282236 12.0358534 15.8208206 8.5149902 9.7184414 14.8045137
## 151 152 153 154 155 156
## 20.8385815 18.3010117 20.1228256 17.2860189 22.3660023 20.1037592
## 157 158 159 160 161 162
## 13.6212589 33.2598270 29.0301727 25.5675277 32.7082767 36.7746701
## 163 164 165 166 167 168
## 40.5576584 41.8472817 24.7886738 25.3788924 37.2034745 23.0874875
## 169 170 171 172 173 174
## 26.4027396 26.6538211 22.5551466 24.2908281 22.9765722 29.0719431
## 175 176 177 178 179 180
## 26.5219434 30.7220906 25.6166931 29.1374098 31.4357197 32.9223157
## 181 182 183 184 185 186
## 34.7244046 27.7655211 33.8878732 30.9923804 22.7182001 24.7664781
## 187 188 189 190 191 192
## 35.8849723 33.4247672 32.4119915 34.5150995 30.7610949 30.2893414
## 193 194 195 196 197 198
## 32.9191871 32.1126077 31.5587100 40.8455572 36.1277008 32.6692081
## 199 200 201 202 203 204
## 34.7046912 30.0934516 30.6439391 29.2871950 37.0714839 42.0319312
## 205 206 207 208 209 210
## 43.1894984 22.6903480 23.6828471 17.8544721 23.4942899 17.0058772
## 211 212 213 214 215 216
## 22.3925110 17.0604275 22.7389292 25.2194255 11.1191674 24.5104915
## 217 218 219 220 221 222
## 26.6033477 28.3551871 24.9152546 29.6865277 33.1841975 23.7745666
## 223 224 225 226 227 228
## 32.1405196 29.7458199 38.3710245 39.8146187 37.5860575 32.3995325
## 229 230 231 232 233 234
## 35.4566524 31.2341151 24.4844923 33.2883729 38.0481048 37.1632863
## 235 236 237 238 239 240
## 31.7138352 25.2670557 30.1001074 32.7198716 28.4271706 28.4294068
## 241 242 243 244 245 246
## 27.2937594 23.7426248 24.1200789 27.4020841 16.3285756 13.3989126
## 247 248 249 250 251 252
## 20.0163878 19.8618443 21.2883131 24.0798915 24.2063355 25.0421582
## 253 254 255 256 257 258
## 24.9196401 29.9456337 23.9722832 21.6958089 37.5110924 43.3023904
## 259 260 261 262 263 264
## 36.4836142 34.9898859 34.8121151 37.1663133 40.9892850 34.4463409
## 265 266 267 268 269 270
## 35.8339755 28.2457430 31.2267359 40.8395575 39.3179239 25.7081791
## 271 272 273 274 275 276
## 22.3029553 27.2034097 28.5116947 35.4767660 36.1063916 33.7966827
## 277 278 279 280 281 282
## 35.6108586 34.8399338 30.3519266 35.3098070 38.7975697 34.3312319
## 283 284 285 286 287 288
## 40.3396307 44.6730834 31.5968909 27.3565923 20.1017415 27.0420667
## 289 290 291 292 293 294
## 27.2136458 26.9139584 33.4356331 34.4034963 31.8333982 25.8178324
## 295 296 297 298 299 300
## 24.4298235 28.4576434 27.3626700 19.5392876 29.1130984 31.9105461
## 301 302 303 304 305 306
## 30.7715945 28.9427587 28.8819102 32.7988723 33.2090546 30.7683179
## 307 308 309 310 311 312
## 35.5622686 32.7090512 28.6424424 23.5896583 18.5426690 26.8788984
## 313 314 315 316 317 318
## 23.2813398 25.5458025 25.4812006 20.5390990 17.6157257 18.3758169
## 319 320 321 322 323 324
## 24.2907028 21.3252904 24.8868224 24.8693728 22.8695245 19.4512379
## 325 326 327 328 329 330
## 25.1178340 24.6678691 23.6807618 19.3408962 21.1741811 24.2524907
## 331 332 333 334 335 336
## 21.5926089 19.9844661 23.3388800 22.1406069 21.5550993 20.6187291
## 337 338 339 340 341 342
## 20.1609718 19.2849039 22.1667232 21.2496577 21.4293931 30.3278880
## 343 344 345 346 347 348
## 22.0473498 27.7064791 28.5479412 16.5450112 14.7835964 25.2738008
## 349 350 351 352 353 354
## 27.5420512 22.1483756 20.4594409 20.5460542 16.8806383 25.4025351
## 355 356 357 358 359 360
## 14.3248663 16.5948846 19.6370469 22.7180661 22.2021889 19.2054806
## 361 362 363 364 365 366
## 22.6661611 18.9319262 18.2284680 20.2315081 37.4944739 14.2819073
## 367 368 369 370 371 372
## 15.5428625 10.8316232 23.8007290 32.6440736 34.6068404 24.9433133
## 373 374 375 376 377 378
## 25.9998091 6.1263250 0.7777981 25.3071306 17.7406106 20.2327441
## 379 380 381 382 383 384
## 15.8333130 16.8351259 14.3699483 18.4768283 13.4276828 13.0617751
## 385 386 387 388 389 390
## 3.2791812 8.0602217 6.1284220 5.6186481 6.4519857 14.2076474
## 391 392 393 394 395 396
## 17.2122518 17.2988727 9.8911664 20.2212419 17.9418118 20.3044578
## 397 398 399 400 401 402
## 19.2955908 16.3363278 6.5516232 10.8901678 11.8814587 17.8117451
## 403 404 405 406 407 408
## 18.2612659 12.9794878 7.3781636 8.2111586 8.0662619 19.9829479
## 409 410 411 412 413 414
## 13.7075637 19.8526845 15.2230830 16.9607198 1.7185181 11.8057839
## 415 416 417 418 419 420
## -4.2813107 9.5837674 13.3666081 6.8956236 6.1477985 14.6066179
## 421 422 423 424 425 426
## 19.6000267 18.1242748 18.5217713 13.1752861 14.6261762 9.9237498
## 427 428 429 430 431 432
## 16.3459065 14.0751943 14.2575624 13.0423479 18.1595569 18.6955435
## 433 434 435 436 437 438
## 21.5272830 17.0314186 15.9609044 13.3614161 14.5207938 8.8197601
## 439 440 441 442 443 444
## 4.8675110 13.0659131 12.7060970 17.2955806 18.7404850 18.0590103
## 445 446 447 448 449 450
## 11.5147468 11.9740036 17.6834462 18.1269524 17.5183465 17.2274251
## 451 452 453 454 455 456
## 16.5227163 19.4129110 18.5821524 22.4894479 15.2800013 15.8208934
## 457 458 459 460 461 462
## 12.6872558 12.8763379 17.1866853 18.5124761 19.0486053 20.1720893
## 463 464 465 466 467 468
## 19.7740732 22.4294077 20.3191185 17.8861625 14.3747852 16.9477685
## 469 470 471 472 473 474
## 16.9840576 18.5883840 20.1671944 22.9771803 22.4558073 25.5782463
## 475 476 477 478 479 480
## 16.3914763 16.1114628 20.5348160 11.5427274 19.2049630 21.8627639
## 481 482 483 484 485 486
## 23.4687887 27.0988732 28.5699430 21.0839878 19.4551620 22.2222591
## 487 488 489 490 491 492
## 19.6559196 21.3253610 11.8558372 8.2238669 3.6639967 13.7590854
## 493 494 495 496 497 498
## 15.9311855 20.6266205 20.6124941 16.8854196 14.0132079 19.1085414
## 499 500 501 502 503 504
## 21.2980517 18.4549884 20.4687085 23.5333405 22.3757189 27.6274261
## 505 506
## 26.1279668 22.3442123
Chapter 2 - Classification Models
Logistic regression on mlbench::Sonar - classification models for categorical outcomes:
Confusion matrix - predicted outcomes vs. actual reality:
Class probabilities and class predictions - can modify thresholds for declaring positive depending on desired specificity vs. sensitivity:
Receiver Operating Characteristic (ROC) curve - looking at many confusion matrices individually is time-consuming and unsystematic; the ROC curve summarizes performance across all thresholds at once:
Area Under the Curve (AUC) - models that are more random will closely follow the diagonal line, while perfect models hit the upper-left corner:
Example code includes:
# 60/40 train/test split of mlbench's Sonar data; rows are shuffled first so
# the split is random with respect to the original row ordering.
data(Sonar, package="mlbench")
# Shuffle row indices: rows
rows <- sample(nrow(Sonar), replace=FALSE)
# Randomly order data: Sonar
Sonar <- Sonar[rows, ]
# Identify row to split on: split
split <- round(nrow(Sonar) * 0.6)
# Create train
train <- Sonar[1:split, ]
# Create test
test <- Sonar[-(1:split), ]
# Fit glm model: model
# Many predictors on few rows leads to separation, hence the warnings below.
model <- glm(Class ~ ., family="binomial", data=train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on test: p
# type="response" gives fitted probabilities; for a binomial glm on a factor
# this is the probability of the second level, i.e. P(Class == "R") here
# (levels shown later as 'M', 'R') -- confirm level order if reusing.
p <- predict(model, newdata=test, type="response")
# Calculate class probabilities: p_class
# Default 0.5 cutoff: predict "R" when P(R) > 0.5, otherwise "M".
p_class <- ifelse(p > 0.5, "R", "M")
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 36 18
## R 11 18
##
## Accuracy : 0.6506
## 95% CI : (0.5381, 0.752)
## No Information Rate : 0.5663
## P-Value [Acc > NIR] : 0.0740
##
## Kappa : 0.2721
## Mcnemar's Test P-Value : 0.2652
##
## Sensitivity : 0.7660
## Specificity : 0.5000
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.6207
## Prevalence : 0.5663
## Detection Rate : 0.4337
## Detection Prevalence : 0.6506
## Balanced Accuracy : 0.6330
##
## 'Positive' Class : M
##
# Apply threshold of 0.9: p_class
# Raising the cutoff makes "R" predictions rarer: stronger evidence is
# required before declaring the positive-for-R class.
p_class <- ifelse(p > 0.9, "R", "M")
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 36 21
## R 11 15
##
## Accuracy : 0.6145
## 95% CI : (0.5012, 0.7193)
## No Information Rate : 0.5663
## P-Value [Acc > NIR] : 0.2198
##
## Kappa : 0.1888
## Mcnemar's Test P-Value : 0.1116
##
## Sensitivity : 0.7660
## Specificity : 0.4167
## Pos Pred Value : 0.6316
## Neg Pred Value : 0.5769
## Prevalence : 0.5663
## Detection Rate : 0.4337
## Detection Prevalence : 0.6867
## Balanced Accuracy : 0.5913
##
## 'Positive' Class : M
##
# Apply threshold of 0.10: p_class
# Lowering the cutoff makes "R" predictions more frequent (any p > 0.1 counts).
p_class <- ifelse(p > 0.1, "R", "M")
# Create confusion matrix
# Qualified with caret:: for consistency with the earlier calls; the bare
# confusionMatrix() only works if caret has been attached via library(caret),
# which this document never does.
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 35 18
## R 12 18
##
## Accuracy : 0.6386
## 95% CI : (0.5257, 0.7412)
## No Information Rate : 0.5663
## P-Value [Acc > NIR] : 0.1110
##
## Kappa : 0.2495
## Mcnemar's Test P-Value : 0.3613
##
## Sensitivity : 0.7447
## Specificity : 0.5000
## Pos Pred Value : 0.6604
## Neg Pred Value : 0.6000
## Prevalence : 0.5663
## Detection Rate : 0.4217
## Detection Prevalence : 0.6386
## Balanced Accuracy : 0.6223
##
## 'Positive' Class : M
##
# Predict on test: p
# Re-predict probabilities on the held-out test set.
p <- predict(model, newdata=test, type="response")
# Make ROC curve
# colAUC returns the area under the ROC curve; plotROC=TRUE also draws it.
caTools::colAUC(p, test$Class, plotROC=TRUE)
## [,1]
## M vs. R 0.6385934
# Create trainControl object: myControl
# classProbs=TRUE is required so caret can compute class probabilities, which
# twoClassSummary needs to calculate ROC/Sens/Spec during cross-validation.
myControl <- caret::trainControl(
method = "cv",
number = 10,
summaryFunction = caret::twoClassSummary, # qualified: caret may not be attached
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# Train glm with custom trainControl: model
model <- caret::train(Class ~ ., data=Sonar, method="glm", trControl=myControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
## + Fold01: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold01: parameter=none
## + Fold02: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold02: parameter=none
## + Fold03: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold03: parameter=none
## + Fold04: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold04: parameter=none
## + Fold05: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold05: parameter=none
## + Fold06: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold06: parameter=none
## + Fold07: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold07: parameter=none
## + Fold08: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold08: parameter=none
## + Fold09: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold09: parameter=none
## + Fold10: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 187, 187, 188, 187, 187, 187, ...
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.7385774 0.7575758 0.67 0.117864 0.11866 0.1127107
##
##
Chapter 3 - Tuning Model Parameters
Random forests and wine - very robust against over-fitting, and frequently yield very accurate non-linear models:
Explore a wider model space - random forests require tuning (hyper-parameters):
Custom tuning grids - further customization of the tuneGrid data frame (most flexible, complete control of grid-search exploration):
Introducing glmnet - extension of generalized linear model (glm) with built-in variable selection:
Custom tuning grids with glmnet - ability to tune on both alpha and lambda:
Example code includes:
redWine <- read.csv("redWine.csv", sep=";")
str(redWine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
whiteWine <- read.csv("whiteWine.csv", sep=";")
str(whiteWine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
# Draw a small mixed sample: 24 red + 76 white wines (100 rows total).
nRed <- 24
nWhite <- 76
# seq_len() instead of 1:nrow(): identical draws for non-empty tables, but
# avoids the 1:0 == c(1, 0) trap if a table were ever empty.
wine <- rbind(redWine[sample(seq_len(nrow(redWine)), nRed, replace=FALSE), ],
whiteWine[sample(seq_len(nrow(whiteWine)), nWhite, replace=FALSE), ]
)
# Label each row by source; the rbind() order guarantees reds come first.
wine$color <- factor(c(rep("red", nRed), rep("white", nWhite)),
levels=c("red", "white")
)
str(wine)
## 'data.frame': 100 obs. of 13 variables:
## $ fixed.acidity : num 8.5 7.4 10.7 8.9 6.6 8 8 6.5 6.8 9.3 ...
## $ volatile.acidity : num 0.66 0.785 0.43 0.28 0.8 0.715 0.705 0.46 0.56 0.49 ...
## $ citric.acid : num 0.2 0.19 0.39 0.45 0.03 0.22 0.05 0.14 0.03 0.36 ...
## $ residual.sugar : num 2.1 5.2 2.2 1.7 7.8 2.3 1.9 2.4 1.7 1.7 ...
## $ chlorides : num 0.097 0.094 0.106 0.067 0.079 0.075 0.074 0.114 0.084 0.081 ...
## $ free.sulfur.dioxide : num 23 19 8 7 6 13 8 9 18 3 ...
## $ total.sulfur.dioxide: num 113 98 32 12 12 81 19 37 35 14 ...
## $ density : num 0.997 0.997 0.999 0.994 0.996 ...
## $ pH : num 3.13 3.16 2.89 3.25 3.52 3.24 3.34 3.66 3.44 3.27 ...
## $ sulphates : num 0.48 0.52 0.5 0.55 0.5 0.54 0.95 0.65 0.63 0.78 ...
## $ alcohol : num 9.2 9.57 9.6 12.3 12.2 ...
## $ quality : int 5 6 5 7 5 6 6 5 6 6 ...
## $ color : Factor w/ 2 levels "red","white": 1 1 1 1 1 1 1 1 1 1 ...
# Fit random forest: model
# tuneLength = 1: caret evaluates a single default mtry value (no grid search);
# the fold log below shows mtry=3 was used throughout.
model <- caret::train(
quality ~ .,
tuneLength = 1,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.2.4
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.2.5
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold5: mtry=3
## - Fold5: mtry=3
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 80, 80
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 0.8237124 0.2156907 0.164984 0.199994
##
## Tuning parameter 'mtry' was held constant at a value of 3
##
# Fit random forest: model
# tuneLength = 3: caret automatically picks 3 candidate mtry values to compare
# (2, 7, 12 in the fold log below) -- a wider model space than tuneLength = 1.
model <- caret::train(
quality ~ .,
tuneLength = 3,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry= 7
## - Fold1: mtry= 7
## + Fold1: mtry=12
## - Fold1: mtry=12
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry= 7
## - Fold2: mtry= 7
## + Fold2: mtry=12
## - Fold2: mtry=12
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry= 7
## - Fold3: mtry= 7
## + Fold3: mtry=12
## - Fold3: mtry=12
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry= 7
## - Fold4: mtry= 7
## + Fold4: mtry=12
## - Fold4: mtry=12
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry= 7
## - Fold5: mtry= 7
## + Fold5: mtry=12
## - Fold5: mtry=12
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 80, 80
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared RMSE SD Rsquared SD
## 2 0.8545207 0.1804462 0.1184670 0.2815883
## 7 0.8842893 0.1591474 0.1264552 0.2941139
## 12 0.8953490 0.1563613 0.1463967 0.2985249
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Fit random forest: model
# tuneGrid gives complete control over the search grid: caret evaluates
# exactly the mtry values supplied (2, 3, 7) instead of choosing them itself.
model <- caret::train(
quality ~ .,
tuneGrid = data.frame(mtry=c(2, 3, 7)),
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold1: mtry=7
## - Fold1: mtry=7
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold2: mtry=7
## - Fold2: mtry=7
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold3: mtry=7
## - Fold3: mtry=7
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold4: mtry=7
## - Fold4: mtry=7
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=3
## - Fold5: mtry=3
## + Fold5: mtry=7
## - Fold5: mtry=7
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 81, 81, 80, 79
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared RMSE SD Rsquared SD
## 2 0.8419694 0.2078658 0.1536963 0.1362576
## 3 0.8499580 0.1956421 0.1566934 0.1425106
## 7 0.8604436 0.1808511 0.1574472 0.1453864
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Create custom trainControl: myControl
# 10-fold CV with ROC-based summary; classProbs=TRUE is required for
# twoClassSummary to work.
myControl <- caret::trainControl(
method = "cv", number = 10,
summaryFunction = caret::twoClassSummary, # qualified: caret may not be attached
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
## DO NOT HAVE (AND CANNOT FIND) DATASET "overfit"
# Fit glmnet model: model
# model <- caret::train(
# y ~ ., data=overfit,
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
# Train glmnet with custom trainControl and tuning: model
# model <- caret::train(
# y ~ ., data=overfit,
# tuneGrid = expand.grid(alpha=0:1, lambda=seq(0.0001, 1, length=100)),
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
Chapter 4 - Pre-processing data